示例#1
0
    def setUp(self, **kwargs):
        """Register a small DNA db, an annotation db and an annotation map.

        Builds a BlastDB from the 'dnaseq.fasta' test file, a three-entry
        AnnotationDB on top of it and an NLMSA holding those annotations,
        stores them as the Bio.Test.dna / annoDB / map resources, binds the
        'exons' relation in the schema and commits the metabase.  The NLMSA
        and the sequence database are closed before returning, whether or
        not the setup succeeded.  ``kwargs`` is accepted but not used.
        """
        TestBase.setUp(self)
        fasta_path = testutil.datafile('dnaseq.fasta')
        map_path = testutil.tempdatafile('tryannot')

        dna = seqdb.BlastDB(fasta_path)
        try:
            dna.__doc__ = 'little dna'
            self.pygrData.Bio.Test.dna = dna

            # Keyed by annotation id; each tuple is laid out as described by
            # sliceAttrDict below: (sequence id, start, stop, name).
            slices = {
                1: ('seq1', 5, 10, 'fred'),
                2: ('seq1', -60, -50, 'bob'),
                3: ('seq2', -20, -10, 'mary'),
            }
            annots = seqdb.AnnotationDB(
                slices, dna,
                sliceAttrDict=dict(id=0, start=1, stop=2, name=3))
            annots.__doc__ = 'trivial annotation'
            self.pygrData.Bio.Test.annoDB = annots

            annot_map = cnestedlist.NLMSA(map_path, 'w', pairwiseMode=True,
                                          bidirectional=False)
            try:
                for annot_id in annots:
                    annot_map.addAnnotation(annots[annot_id])
                annot_map.build()
                annot_map.__doc__ = 'trivial map'
                self.pygrData.Bio.Test.map = annot_map

                relation = metabase.ManyToManyRelation(
                    dna, annots, bindAttrs=('exons', ))
                self.schema.Bio.Test.map = relation
                self.metabase.commit()
                self.metabase.clear_cache()
            finally:
                annot_map.close()
        finally:
            dna.close()
示例#2
0
def makeNLMSA(annotDBList, dataPath='memory'):
    """Build a pairwise NLMSA holding every annotation of several databases.

    annotDBList -- iterable of annotation databases; each must provide
                   ``values()`` and a ``__doc__`` used for progress output.
    dataPath    -- 'memory' builds the map in memory; any other value is
                   opened in 'w' mode under that path and built with
                   ``saveSeqDict=True``.

    Returns the built NLMSA, whose ``__doc__`` lists the source databases.
    """
    in_memory = (dataPath == 'memory')
    nlmsa = cnestedlist.NLMSA(dataPath, 'memory' if in_memory else 'w',
                              pairwiseMode=True)

    nlmsa.__doc__ = 'NLMSA built against '
    for source in annotDBList:
        label = source.__doc__
        nlmsa.__doc__ = nlmsa.__doc__ + ' %s, ' % (label)
        print('# Adding annotations to NLMSA from %s...' % label)
        for annotation_item in source.values():
            nlmsa.addAnnotation(annotation_item)

    print('# Building annotation map...')
    if not in_memory:
        # On-disk maps also save their sequence dictionary.
        nlmsa.build(saveSeqDict=True)
    else:
        nlmsa.build()
    return nlmsa
示例#3
0
def bed2pygr(dbprefix, referencefile, bedfile, indir):
    """Build an on-disk NLMSA from a BED file and record its genome name.

    dbprefix      -- path prefix of the NLMSA to create (opened in 'w' mode);
                     the genome name is written to ``dbprefix + '.genome'``.
    referencefile -- sequence file opened with seqdb.SequenceFileDB; its
                     basename without the last extension is the genome name.
    bedfile       -- BED input handed to load_bed().
    indir         -- accepted for interface compatibility; not used here.
    """
    collision_counter = defaultdict(int)
    chrdb = seqdb.SequenceFileDB(referencefile)
    annodb = annotation.AnnotationDB({}, chrdb)

    al = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)
    load_bed(al, annodb, bedfile, collision_counter)
    al.build(saveSeqDict=True)

    genomeprefix = os.path.basename(referencefile).rsplit('.', 1)[0]
    # Close the file explicitly so the name is flushed to disk before we
    # return, instead of relying on garbage collection of an anonymous handle.
    with open(dbprefix + '.genome', 'w') as genome_file:
        genome_file.write(genomeprefix + '\n')
示例#4
0
def populate_swissprot():
    """Populate the current worldbase with swissprot data.

    Registers, in this order: the 'sp_hbb1' sequence database, a sequence
    fragment, a mapping of the database onto itself (with its 'buddy'
    schema relation), and a one-entry annotation database together with
    the NLMSA map and 'exons' schema relation that bind it to the sequences.
    """
    # Sequence database built from the bundled test file.
    swissprot = seqdb.BlastDB(testutil.datafile('sp_hbb1'))
    swissprot.__doc__ = 'little swissprot'
    worldbase.Bio.Seq.Swissprot.sp42 = swissprot

    # A sequence interval stored as a resource of its own.
    hbb = swissprot['HBB1_TORMA']
    fragment = hbb[10:35]
    fragment.__doc__ = 'fragment'
    worldbase.Bio.Seq.frag = fragment

    # Mapping from the database onto itself: HBB1_TORMA -> PRCA_ANAVA.
    self_map = mapping.Mapping(sourceDB=swissprot, targetDB=swissprot)
    self_map[hbb] = swissprot['PRCA_ANAVA']
    self_map.__doc__ = 'map sp to itself'
    worldbase.Bio.Seq.spmap = self_map
    worldbase.schema.Bio.Seq.spmap = metabase.OneToManyRelation(
        swissprot, swissprot, bindAttrs=('buddy', ))

    # Annotation database with a single slice on HBB1_TORMA.
    annotations = seqdb.AnnotationDB(
        {1: ('HBB1_TORMA', 10, 50)}, swissprot,
        sliceAttrDict=dict(id=0, start=1, stop=2))
    exon = annotations[1]

    # Keep the TempDir object in a local for the rest of the function so its
    # lifetime matches the code that uses the file inside it.
    tempdir = testutil.TempDir('exonAnnot')
    map_path = tempdir.subfile('cnested')
    annot_map = cnestedlist.NLMSA(map_path, 'w', pairwiseMode=True,
                                  bidirectional=False)
    annot_map.addAnnotation(exon)
    annot_map.build()

    annotations.__doc__ = 'a little annotation db'
    annot_map.__doc__ = 'a little map'
    worldbase.Bio.Annotation.annoDB = annotations
    worldbase.Bio.Annotation.map = annot_map
    worldbase.schema.Bio.Annotation.map = metabase.ManyToManyRelation(
        swissprot, annotations, bindAttrs=('exons', ))
示例#5
0
文件: analyze.py 项目: bethv/phenoseq
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''Construct an annotation DB for gene CDS intervals.
    NB: this assumes each gene consists of ONE interval.
    This cannot be used for multi-exon genes!

    gbfile        -- path of the GenBank file to parse.
    fastafile     -- path of the matching FASTA file; defaults to the
                     GenBank file name cut at its first dot, plus ".fna",
                     in the same directory as gbfile.
    featureType   -- only features of this type are annotated.
    geneQualifier -- qualifier whose first value names the annotation;
                     features lacking it get an "unlabeled_..." name and
                     trigger a warning.

    Returns (annodb, al, genome): the annotation database, an in-memory
    NLMSA containing every annotation, and the sequence database.'''
    import os  # local import: do not depend on a module-level "import os"
    try:
        gbparse = SeqIO.parse(gbfile, 'genbank')
    except TypeError: # SeqIO changed its interface?
        ifile = open(gbfile)
        try:
            gbseqs = list(SeqIO.parse(ifile, 'genbank'))
        finally:
            ifile.close()
    else:
        gbseqs = list(gbparse)
    if fastafile is None:
        # Split only the file name at its first dot.  Splitting the whole
        # path would truncate at a dot in a directory component, e.g.
        # "./NC_000913.gbk" used to yield ".fna".
        dirname, basename = os.path.split(gbfile)
        fastafile = os.path.join(dirname, basename.split('.')[0] + '.fna')
    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome) # handle NCBI ID blobs properly
    annodb = annotation.AnnotationDB({}, genome,
                                     sliceAttrDict=dict(id=0, start=1, stop=2,
                                                        orientation=3))
    unlabeled = 0 # counter that keeps generated names unique
    for s in gbseqs:
        seqID = genomeIndex[s.id].id # find the right seq and get its actual ID
        for f in s.features:
            if f.type != featureType:
                continue
            try:
                name = f.qualifiers[geneQualifier][0]
            except KeyError: # keep the annotation even if label missing
                warnings.warn('Missing gene qualifier "%s" on %s annotation'
                              % (geneQualifier, featureType))
                name = 'unlabeled_%s_%d' % (featureType, unlabeled)
                unlabeled += 1
            annodb.new_annotation(name,
                    (seqID, f.location.start.position,
                     f.location.end.position, f.strand))
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for a in annodb.itervalues():
        al.addAnnotation(a)
    al.build()
    return annodb, al, genome
示例#6
0
文件: analyze.py 项目: bethv/phenoseq
def read_exon_annots(genome, genesFile='knownGene.txt'):
    '''Read a multi-exon transcript set; build the exon annotation db,
    its in-memory NLMSA and the exon-to-gene mapping.

    Returns (annodb, al, exonGene, totalSize, geneLengths), where exonGene
    maps each integer exon ID to its gene ID and totalSize is the sum of
    the values of geneLengths.'''
    exonDict, genes, trLen = read_known_genes(genesFile)
    geneLengths = get_gene_maxlengths(genes, trLen)
    totalSize = sum(geneLengths.values())

    slice_layout = dict(id=0, orientation=1, start=2, stop=3)
    annodb = annotation.AnnotationDB({}, genome, sliceAttrDict=slice_layout)
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True,
                           maxlen=1000000000)

    exonGene = {}
    # Exons are numbered consecutively from 0 in dictionary iteration order.
    for exon_id, (exon_slice, geneID) in enumerate(exonDict.iteritems()):
        exon_annot = annodb.new_annotation(exon_id, exon_slice)
        exonGene[exon_id] = geneID
        al.addAnnotation(exon_annot)
    al.build()
    return annodb, al, exonGene, totalSize, geneLengths
示例#7
0
def bedToNLMSA(bedlines,
               genome,
               field_locations=dict(id=0,
                                    start=1,
                                    stop=2,
                                    name=3,
                                    score=4,
                                    orientation=-1)):
    """Build an annotation database and in-memory NLMSA from BED lines.

    bedlines        -- iterable of tab-separated BED lines; empty and
                       whitespace-only lines are ignored.
    genome          -- sequence database the annotations refer to.
    field_locations -- sliceAttrDict mapping attribute names to column
                       indexes.  The orientation (+1/-1) is appended to
                       each row as its last column, hence the default
                       ``orientation=-1``.  The default dict is shared
                       between calls and is not modified here.

    Rows whose annotation raises KeyError are reported on stdout and
    skipped.  Returns (annotDB, nlmsa).
    """
    annotDB = annotation.AnnotationDB(None,
                                      genome,
                                      verbose=False,
                                      sliceAttrDict=field_locations)
    nlmsa = cnestedlist.NLMSA('tmp_bed',
                              mode='memory',
                              pairwiseMode=True,
                              bidirectional=False)
    index = 0
    for line in bedlines:
        if not line or not line.strip():
            continue
        fields = line.strip().split('\t')
        # Strand is the sixth BED column; rows without it count as '+'.
        orientation = 1 if len(fields) < 6 or fields[5] == '+' else -1
        try:
            curAnnot = annotDB.new_annotation(index, fields + [orientation])
            nlmsa.addAnnotation(curAnnot)
            index += 1
        except KeyError as e:
            # The handler used to read the undefined name `row`, turning
            # every skipped row into a NameError.  Report the first column
            # of the offending row instead.
            reason = e.args[0] if len(e.args) == 1 else ''
            print('Skipping row without matching chromosome: %s,'
                  'message: %s' % (fields[0], reason))
    nlmsa.build()
    return annotDB, nlmsa
示例#8
0
    def test_build(self):
        'Test building an NLMSA and querying results'
        from pygr import seqdb, cnestedlist

        def split_seq_id(seqDict, seq):
            'Return the ID of seq split at its first dot: (species, name)'
            fullID = (~seqDict)[seq]
            dotindex = fullID.index('.')
            return fullID[:dotindex], fullID[dotindex + 1:]

        def edge_lines(msa, edges):
            'Format the edges joining two 2-letter intervals as output lines'
            result = []
            for src, dest, e in edges:
                if len(str(src)) != 2 or len(str(dest)) != 2:
                    continue
                srcspecies, src1 = split_seq_id(msa.seqDict, src)
                destspecies, dest1 = split_seq_id(msa.seqDict, dest)
                wlist = str(src), srcspecies, src1, src.start, src.stop, \
                        str(dest), destspecies, dest1, dest.start, dest.stop
                result.append('\t'.join(map(str, wlist)) + '\n')
            return result

        def file_digest(filename):
            'Return the md5 digest of the contents of filename'
            ifile = open(filename, 'r')
            try:
                return hashlib.md5(ifile.read()).digest()
            finally:
                ifile.close()

        def check_splice_sites(msa, newOutputName, inputName, expectedName):
            '''Query msa for both splice sites of every interval listed in
            inputName, write the results to newOutputName and assert that
            they match the contents of expectedName'''
            tmpInputName = self.copyFile(inputName)
            tmpOutputName = self.copyFile(expectedName)
            outfile = open(newOutputName, 'w')
            try:
                infile = open(tmpInputName, 'r')
                try:
                    for lines in infile:
                        chrid, intstart, intend, nobs = \
                                lines.strip().split('\t')
                        intstart, intend, nobs = \
                                int(intstart), int(intend), int(nobs)
                        chrom = msa.seqDict['dm2' + '.' + chrid]
                        site1 = chrom[intstart:intstart + 2]
                        site2 = chrom[intend - 2:intend]
                        edges1 = msa[site1].edges()
                        edges2 = msa[site2].edges()
                        if len(edges1) == 0:  # EMPTY EDGES
                            wlist = str(site1), 'dm2', chrid, intstart, \
                                    intstart + 2, '', '', '', '', ''
                            outfile.write('\t'.join(map(str, wlist)) + '\n')
                        if len(edges2) == 0:  # EMPTY EDGES
                            wlist = str(site2), 'dm2', chrid, intend - 2, \
                                    intend, '', '', '', '', ''
                            outfile.write('\t'.join(map(str, wlist)) + '\n')
                        saveList = edge_lines(msa, edges1) + \
                                edge_lines(msa, edges2)
                        # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                        saveList.sort()
                        for saveline in saveList:
                            outfile.write(saveline)
                finally:
                    infile.close()
            finally:
                outfile.close()
            assert file_digest(newOutputName) == file_digest(tmpOutputName)

        genomedict = {}
        for orgstr in msaSpeciesList:
            genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' +
                                                       orgstr)
        uniondict = seqdb.PrefixUnionDict(genomedict)
        if smallSampleKey:
            maflist = (os.path.join(mafDir, smallSampleKey + '.maf'), )
        else:
            maflist = glob.glob(os.path.join(mafDir, '*.maf'))
            maflist.sort()
        msaname = os.path.join(self.path, 'dm2_multiz15way')
        # 500MB VERSION
        msa1 = cnestedlist.NLMSA(msaname,
                                 'w',
                                 uniondict,
                                 maflist,
                                 maxlen=536870912,
                                 maxint=22369620)
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        outfileName = os.path.join(testInputDir,
                                   'splicesite_dm2%s.txt' % smallSamplePostfix)
        outputName = os.path.join(
            testInputDir,
            'splicesite_dm2%s_multiz15way.txt' % smallSamplePostfix)
        check_splice_sites(msa,
                           os.path.join(self.path, 'splicesite_new1.txt'),
                           outfileName, outputName)

        # TEXT<->BINARY TEST
        msafilelist = glob.glob(msaname + '*')
        msa.save_seq_dict()
        cnestedlist.dump_textfile(
            msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
        for filename in msafilelist:
            os.remove(filename)
        runPath = os.path.realpath(os.curdir)
        os.chdir(self.path)
        try:
            cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
        finally:
            # restore the working directory even if the conversion fails
            os.chdir(runPath)

        # Repeat the same queries against the NLMSA rebuilt from text.
        msa1 = cnestedlist.NLMSA(msaname, 'r')
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        check_splice_sites(msa,
                           os.path.join(self.path, 'splicesite_new2.txt'),
                           outfileName, outputName)
示例#9
0
def main():
    """Cluster target and query sequences that are linked by alignments.

    Command line: argv[1] is the query sequence file, argv[2] the target
    sequence file and argv[3] the alignment file; argv[5], when present,
    overrides MIN_SCORE.  Exits via SystemExit when fewer than three
    arguments are given.

    Every alignment is loaded into an in-memory pairwise NLMSA, a graph
    linking aligned sequences is built from it, and each group of nodes
    returned by write_sequence() is written to 'cluster_<n>_targets' and
    'cluster_<n>_queries'.  'assemgraph.log' receives one
    "cluster, sequence, length" line per clustered sequence.
    """
    # Without this declaration the assignment below only created an unused
    # local, so the command-line value was silently discarded.
    # NOTE(review): presumably the other functions in this module read the
    # module-level MIN_SCORE -- confirm.  Index 5 (rather than 4) is kept
    # as in the original; verify against the intended usage.
    global MIN_SCORE
    if len(sys.argv) < 4:
        raise SystemExit
    try:
        MIN_SCORE = float(sys.argv[5])
    except IndexError:
        pass  # optional argument not supplied; keep the current value

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)
    try:
        align_file = open(sys.argv[3])
    except IOError:
        print >> sys.stderr, 'Error: check alignment file.'
        raise  # bare raise keeps the original traceback

    target_list = set()
    try:
        aligndb = cnestedlist.NLMSA('alignment', mode='memory',
                                    pairwiseMode=True)

        print >> sys.stderr, 'Adding sequences to an alignment database...'
        for c, al in enumerate(parse_alignments(align_file)):
            aligndb += targets[al.target]
            target_list.add(al.target)
            add_alignment(aligndb, al, targets, queries)
            if c % 100 == 0: print >> sys.stderr, '...', c
    finally:
        align_file.close()

    print >> sys.stderr, 'Building the alignment database...'
    aligndb.build()

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, target in enumerate(target_list):
        try:
            sub_ival = targets[target]
            for src, dest, edge in aligndb[sub_ival].edges():
                # Node name: the repr() up to the first '[' with leading
                # '-' characters removed.
                source = repr(src).split('[')[0].lstrip('-')
                destination = repr(dest).split('[')[0].lstrip('-')
                graph.add_edge(source, destination)
        except KeyError:
            # Best effort: a target that cannot be looked up adds no edges.
            pass
        if c % 100 == 0: print >> sys.stderr, '...', c

    logfile = open('assemgraph.log', 'w')
    try:
        visited_nodes = set()
        cluster_no = 0
        for node in graph.nodes():
            if node in visited_nodes:
                continue
            ofile1 = open('cluster_%d_targets' % cluster_no, 'w')
            try:
                ofile2 = open('cluster_%d_queries' % cluster_no, 'w')
                try:
                    print >> sys.stderr, \
                            'Writing cluster %d to a file...' % cluster_no,
                    vnodes, max_length = write_sequence(
                        node, graph, targets, queries, ofile1, ofile2)
                    visited_nodes.update(vnodes)
                    for n in vnodes:
                        if n in targets:
                            size = len(targets[n])
                        else:
                            size = len(queries[n])
                        print >> logfile, \
                                'cluster_%d\t%s\t%d' % (cluster_no, n, size)
                finally:
                    ofile2.close()
            finally:
                ofile1.close()
            print >> sys.stderr, '\ttotal nodes = %d' % len(vnodes)

            cluster_no += 1

        print >> logfile, '***finished***'
    finally:
        logfile.close()
示例#10
0
文件: gfp.py 项目: wclee47/GFP
def _print_usage(min_pair, min_span, min_cov, min_shift):
    """Print the command-line help, showing the given default thresholds."""
    print('')
    print("GFP --- A tool to detect fusion genes using RNA-Seq")
    print("\nRequired parameters")
    print("\t-i <string>             GSNAP result file.")
    print("\t-d <string>             Pre-built exon index directory.")
    print("\t-o <string>             Output prefix.")
    print("\t--bl2seq <string>       bl2seq excutable path.")
    print('')
    print("Optional parameters")
    print("\t--mpair <integer>       Minimum # of discordant read-pairs, DEFAULT: %d." % min_pair)
    print("\t--mspan <integer>       Minumum # of fusion spanning reads, DEFAULT: %d." % min_span)
    print("\t--mcov <integer>        Minimum # of base-pairs for both genes, DEFAULT: %d." % min_cov)
    print("\t--mshift <integer>      Minimum # of shifting pattern(bp), DEFAULT: %d." % min_shift)
    print('')


def _read_record_lines(handle):
    """Return the lines read from handle up to the next blank line.

    Also stops at end of file: the original loops only tested for a blank
    line and never terminated on input that ends without one.
    """
    lines = []
    while True:
        line = handle.readline()
        if not line or line == "\n":
            return lines
        lines.append(line)


def _transcript_strands(exons, strDict):
    """Return the distinct strands of the transcripts owning exons.

    The transcript name is the part of each exon name before its first
    '.'; a strDict entry may list several strands separated by '/'.
    First-seen order is preserved.
    """
    strands = []
    for exon in exons:
        for strand in strDict[exon.split('.')[0]].split('/'):
            if strand not in strands:
                strands.append(strand)
    return strands


def _extract_fusion_evidence(infile, outprefix, indir, als, als_chrDic,
                             strDict):
    """Scan the GSNAP result and write evidence to '<outprefix>_raw.txt'.

    Headers starting with '>' introduce read 1 and those starting with '<'
    read 2; a read is used only when the second token of its header is
    '1'.  A line is written for each pair whose reads map to exons sharing
    no second name component and for which check_pDonor() does not return
    "NA".  exonMapper() receives the output handle as well.
    """
    sep = re.compile("[;|]+")  # compiled once instead of once per pair
    read1Exons, read2Exons = '', ''
    poss1, poss2 = '', ''
    strand1, strand2 = '', ''
    with open(infile, 'r') as fi:
        with open(outprefix + "_raw.txt", 'w') as fo:
            # readline() is used throughout because record bodies are
            # consumed inside the loop; plain file iteration cannot be
            # mixed with readline() in Python 2.
            while True:
                line = fi.readline()
                if not line:
                    break

                if line.startswith('>'):  # Read1
                    read1Exons = ''
                    if line.split()[1] != '1':
                        continue  # Skip multiply-mapped read
                    aligns1 = _read_record_lines(fi)
                    poss1, read1Exons, strand1 = exonMapper(
                        aligns1, fo, indir, als, als_chrDic, strDict)

                elif line.startswith('<'):  # Read2
                    read2Exons = ''
                    if line.split()[1] != '1':
                        continue
                    aligns2 = _read_record_lines(fi)
                    poss2, read2Exons, strand2 = exonMapper(
                        aligns2, fo, indir, als, als_chrDic, strDict)

                    if read1Exons == '' or read2Exons == '':
                        continue
                    exons1 = sep.split(read1Exons)
                    exons2 = sep.split(read2Exons)
                    sameFlg = False
                    for read1exon in exons1:
                        for read2exon in exons2:
                            if read1exon.split('.')[1] == \
                                    read2exon.split('.')[1]:
                                sameFlg = True
                    if sameFlg:
                        continue

                    # Putative fusion pair: donor check.
                    ts1 = _transcript_strands(exons1, strDict)
                    ts2 = _transcript_strands(exons2, strDict)
                    donor = check_pDonor(ts1, ts2, strand1, strand2)
                    if donor == "NA":
                        continue

                    ctext = "pINTER"
                    for pos1 in poss1.split(';'):
                        for pos2 in poss2.split(';'):
                            if pos1.split(':')[0] == pos2.split(':')[0]:
                                ctext = "pINTRA"
                    # No swap!
                    fo.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                             (ctext, poss1, poss2, read1Exons, read2Exons,
                              strand1, strand2, donor))


def _collect_fusions(outprefix, indir, bl2seqPATH):
    """Group the lines of '<outprefix>_raw.txt' into Fusion objects.

    Lines are dropped when estimate_dist() returns a negative distance or
    check_homology() reports the two genes as homologous.  Returns the
    Fusion objects in order of first appearance and removes the temporary
    files named after outprefix afterwards.
    """
    txPos, txFASTA = {}, {}
    build_txPos(indir, txPos)
    build_txFASTA(indir, txFASTA)

    sep = re.compile("[;|]+")  # Set separators
    homologous_pairs = {}
    fusions = []  # Fusion class list, in order of first appearance
    fusion_by_name = {}  # same objects, indexed for constant-time lookup
    with open(outprefix + "_raw.txt", 'r') as fi:
        for line in fi:
            fields = line.rstrip().split("\t")
            hNames = [exon.split('.')[0] for exon in sep.split(fields[3])]
            tNames = [exon.split('.')[0] for exon in sep.split(fields[4])]

            # Sequence Homology Detection by bl2seq
            is_homologous = check_homology(hNames, tNames, bl2seqPATH,
                                           homologous_pairs, txFASTA,
                                           outprefix)

            # Estimate genes' distance
            minDist = estimate_dist(hNames, tNames, txPos)
            if minDist != "NA" and minDist < 0:
                continue  # Overlapping genes
            if is_homologous:
                continue

            # Putative gene fusion: name it "donor<TAB>acceptor".
            head = fields[3].split('.')[1]
            tail = fields[4].split('.')[1]
            if fields[-1] == "read2" or fields[-1] == "tail":
                donor_acceptor = tail + "\t" + head
            else:
                donor_acceptor = head + "\t" + tail
            evidence_type, ctext = fields[0][:1], fields[0][1:]

            fusion = fusion_by_name.get(donor_acceptor)
            if fusion is None:
                fusion = Fusion(donor_acceptor, ctext)
                fusion.minDist = str(minDist)
                fusions.append(fusion)
                fusion_by_name[donor_acceptor] = fusion
            elif minDist != "NA" and (fusion.minDist == "NA" or
                                      minDist < int(fusion.minDist)):
                fusion.minDist = str(minDist)
            if evidence_type == 'p':
                fusion.nPairs += 1
            else:
                fusion.nSpans += 1
            fusion.evidences.append(line.rstrip())

    # Remove the temporary files directly rather than through a string-built
    # "rm" shell command, which broke on prefixes containing spaces or shell
    # metacharacters.
    for suffix in ("_temp1.fasta", "_temp2.fasta", "_temp.bl2seqout"):
        try:
            os.remove(outprefix + suffix)
        except OSError:
            pass  # best effort: the file may never have been created
    return fusions


def _write_fusion_reports(outprefix, fusions, min_pair, min_span, min_cov,
                          min_shift):
    """Filter fusions and write the fusion list and evidence files.

    A fusion needs at least min_pair read pairs and min_span spanning
    reads; each of its span clusters from cov_filter() must hold at least
    min_span reads and pass shift_filter(), which also supplies the running
    fusion number.
    """
    fusionNum = 0
    with open(outprefix + "_fusionList.txt", 'w') as fo_list:
        fo_list.write(
            "ID\tdonor\tacceptor\tcontext\tdist\tnum_pair\tnum_span\n")
        with open(outprefix + "_fusionEvidence.txt", 'w') as fo_evidence:
            fo_evidence.write(
                "ID\tevidence_type\tdonor_pos\tacceptor_pos\tdonor_exon\tacceptor_exon\n"
            )
            for fusion in fusions:
                if fusion.nPairs < min_pair or fusion.nSpans < min_span:
                    continue
                spans = [evidence for evidence in fusion.evidences
                         if evidence.split("\t")[0][:1] == 's']
                for cluster in cov_filter(spans, min_cov):
                    if len(cluster) < min_span:
                        continue
                    shift_pass, fusionNum = shift_filter(
                        cluster, min_shift, fo_evidence, fusionNum)
                    if not shift_pass:
                        continue
                    for evidence in fusion.evidences:
                        evidence_fields = evidence.split("\t")
                        if evidence_fields[0][:1] != 'p':
                            continue
                        if evidence_fields[-1] == "read1":
                            columns = (1, 2, 3, 4)
                        else:
                            # read 2 is the donor: swap both column pairs
                            columns = (2, 1, 4, 3)
                        fo_evidence.write(
                            "GF%d\tread-pair\t%s\t%s\t%s\t%s\n" %
                            ((fusionNum, ) + tuple(
                                evidence_fields[c] for c in columns)))
                    fo_list.write("GF%d\t%s\t%s\t%s\t%d\t%d\n" %
                                  (fusionNum, fusion.name, fusion.ctext,
                                   fusion.minDist, fusion.nPairs,
                                   len(cluster)))


def main():
    """Command-line entry point of GFP.

    With no arguments, prints the usage and exits with status 1.
    Otherwise parses the options, loads the strand table and one NLMSA per
    *.bed file (other than transcript.bed) found in the index directory,
    then runs three phases: extract fusion evidence from the GSNAP result,
    group it into fusion candidates, and write the filtered output files.
    """
    als, als_chrDic, strDict = [], {}, {}

    # Required parameters
    infile = ''
    indir = ''
    outprefix = ''
    bl2seqPATH = ''

    # Optional parameters
    min_pair = 1  # Minimum # of discordant read-pairs
    min_span = 2  # Minimum # of fusion spanning reads
    min_cov = 10  # Minimum # of base-pairs for both genes.
    min_shift = 1  # Minimum shifting pattern(bp) around fusion point.

    if len(sys.argv) == 1:
        _print_usage(min_pair, min_span, min_cov, min_shift)
        sys.exit(1)
    opts, args = getopt.getopt(
        sys.argv[1:], "i:d:o:",
        ["bl2seq=", "mpair=", "mspan=", "mcov=", "mshift="])
    for opt, arg in opts:
        if opt == "-i": infile = arg
        elif opt == "-d": indir = arg
        elif opt == "-o": outprefix = arg
        elif opt == "--bl2seq": bl2seqPATH = arg
        elif opt == "--mpair": min_pair = int(arg)
        elif opt == "--mspan": min_span = int(arg)
        elif opt == "--mcov": min_cov = int(arg)
        elif opt == "--mshift": min_shift = int(arg)

    # Generate strand dictionary: fourth column -> last column
    with open(os.path.join(indir, "transcript.bed")) as strand_file:
        for line in strand_file:
            fields = line.rstrip().split("\t")
            strDict[fields[3]] = fields[-1]

    # Preprocessing for pygr
    print(str(datetime.now()) + "\tPreprocessing pygr requirements...")
    bedfiles = [name for name in os.listdir(indir)
                if name.endswith(".bed") and name != "transcript.bed"]
    for i, bedfile in enumerate(bedfiles):
        prefix = bedfile.split('.')[0]
        als.append(
            cnestedlist.NLMSA(os.path.join(indir, prefix),
                              'r',
                              pairwiseMode=True))
        als_chrDic[prefix] = i

    # Read GSNAP result
    print(str(datetime.now()) +
          "\tReading GSNAP result & extracting fusion evidence...")
    _extract_fusion_evidence(infile, outprefix, indir, als, als_chrDic,
                             strDict)
    strDict.clear()

    # Generate gene fusion candidates
    print(str(datetime.now()) +
          "\tEstimating homology & distance between genes...")
    fusions = _collect_fusions(outprefix, indir, bl2seqPATH)

    # Apply the filtering cascade
    print(str(datetime.now()) +
          "\tApplying filtering steps & generating output files...")
    _write_fusion_reports(outprefix, fusions, min_pair, min_span, min_cov,
                          min_shift)
    print(str(datetime.now()) + "\tGFP is successfully terminated.")
示例#11
0
    def test_mysqlannot(self):
        'Test building an AnnotationDB from MySQL'
        from pygr import seqdb, cnestedlist, sqlgraph

        def file_digest(path):
            'Return the MD5 digest of the contents of the file at path'
            infile = open(path, 'r')
            try:
                md5 = hashlib.md5()
                md5.update(infile.read())
                return md5.digest()
            finally:
                infile.close() # do not rely on garbage collection

        dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
        # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
        exon_slices = sqlgraph.SQLTableClustered(
            '%s.pygr_refGene_exonAnnot%s_dm2' % (testInputDB,
                                                 smallSamplePostfix),
            clusterKey='chromosome', maxCache=0)
        exon_db = seqdb.AnnotationDB(exon_slices, dm2,
                                     sliceAttrDict=dict(id='chromosome',
                                                        gene_id='name',
                                                        exon_id='exon_id'))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'refGene_exonAnnot_SQL_dm2'), 'w',
                                pairwiseMode=True, bidirectional=False)
        for annot_id in exon_db:
            msa.addAnnotation(exon_db[annot_id])
        exon_db.clear_cache() # not really necessary; cache should autoGC
        exon_slices.clear_cache()
        msa.build()
        exon_db.__doc__ = 'SQL Exon Annotation Database for dm2'
        pygr.Data.addResource('TEST.Annotation.SQL.dm2.exons', exon_db)
        msa.__doc__ = 'SQL NLMSA Exon for dm2'
        pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa)
        exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db,
                                                   bindAttrs=('exon2', ))
        exon_schema.__doc__ = 'SQL Exon Schema for dm2'
        pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema)
        # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
        splice_slices = sqlgraph.SQLTableClustered(
            '%s.pygr_refGene_spliceAnnot%s_dm2' % (testInputDB,
                                                   smallSamplePostfix),
            clusterKey='chromosome', maxCache=0)
        splice_db = seqdb.AnnotationDB(splice_slices, dm2,
                                       sliceAttrDict=dict(
                                           id='chromosome', gene_id='name',
                                           splice_id='splice_id'))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'refGene_spliceAnnot_SQL_dm2'),
                                'w', pairwiseMode=True, bidirectional=False)
        for annot_id in splice_db:
            msa.addAnnotation(splice_db[annot_id])
        splice_db.clear_cache() # not really necessary; cache should autoGC
        splice_slices.clear_cache()
        msa.build()
        splice_db.__doc__ = 'SQL Splice Annotation Database for dm2'
        pygr.Data.addResource('TEST.Annotation.SQL.dm2.splices', splice_db)
        msa.__doc__ = 'SQL NLMSA Splice for dm2'
        pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa)
        splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db,
                                                     bindAttrs=('splice2', ))
        splice_schema.__doc__ = 'SQL Splice Schema for dm2'
        pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices',
                            splice_schema)
        # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC:
        # MYSQL VERSION
        ucsc_slices = sqlgraph.SQLTableClustered(
            '%s.pygr_phastConsElements15way%s_dm2' % (testInputDB,
                                                      smallSamplePostfix),
            clusterKey='chromosome', maxCache=0)
        ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
                                     sliceAttrDict=dict(id='chromosome',
                                                        gene_id='name',
                                                        ucsc_id='ucsc_id'))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'phastConsElements15way_SQL_dm2'),
                                'w', pairwiseMode=True, bidirectional=False)
        for annot_id in ucsc_db:
            msa.addAnnotation(ucsc_db[annot_id])
        ucsc_db.clear_cache() # not really necessary; cache should autoGC
        ucsc_slices.clear_cache()
        msa.build()
        ucsc_db.__doc__ = 'SQL Most Conserved Elements for dm2'
        pygr.Data.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved',
                              ucsc_db)
        msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for dm2'
        pygr.Data.addResource(
            'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa)
        ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db,
                                                   bindAttrs=('element2', ))
        ucsc_schema.__doc__ = \
                'SQL Schema for UCSC Most Conserved Elements for dm2'
        pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved',
                            ucsc_schema)
        pygr.Data.save()
        pygr.Data.clear_cache()

        # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
        dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
        exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
        splicemsa = \
                pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
        conservedmsa = pygr.Data.getResource(
            'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
        exons = pygr.Data.getResource('TEST.Annotation.SQL.dm2.exons')
        splices = pygr.Data.getResource('TEST.Annotation.SQL.dm2.splices')
        mostconserved = \
            pygr.Data.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')

        # OPEN DM2_MULTIZ15WAY NLMSA
        genome_msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'),
                                       'r', trypath=[seqDir])

        exonAnnotFileName = os.path.join(testInputDir,
                                  'Annotation_ConservedElement_Exons%s_dm2.txt'
                                         % smallSamplePostfix)
        intronAnnotFileName = os.path.join(testInputDir,
                                'Annotation_ConservedElement_Introns%s_dm2.txt'
                                           % smallSamplePostfix)
        newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
        newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
        tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
        tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

        if smallSampleKey:
            chrList = [smallSampleKey]
        else:
            chrList = sorted(dm2.seqLenDict.keys())

        def conserved_lines(prefix, chrid, annot_seq):
            'Return the sorted report lines for one annotation interval'
            try:
                conserved = conservedmsa[annot_seq]
            except KeyError:
                return []
            elements = [(ix.ucsc_id, ix) for ix in conserved.keys()]
            elements.sort()
            lines = []
            for sort_key, element in elements:
                if element.stop - element.start < 100:
                    continue
                # the score is the integer after the '=' in gene_id
                score = int(element.gene_id.split('=')[1])
                if score < 100:
                    continue
                element_seq = element.sequence
                # FOR REAL ELEMENT COORDINATE
                real_element = mostconserved[element.ucsc_id]
                real_slice = real_element.sequence
                fields = prefix + (real_element.ucsc_id, real_element.gene_id,
                                   real_slice.start, real_slice.stop)
                # query the alignment only where both intervals overlap
                start = max(annot_seq.start, element_seq.start)
                stop = min(annot_seq.stop, element_seq.stop)
                overlap = genome_msa.seqDict['dm2.' + chrid][start:stop]
                for src, dest, edge in genome_msa[overlap].edges():
                    if src.stop - src.start < 100:
                        continue
                    palign, pident = edge.pAligned(), edge.pIdentity()
                    if palign < 0.8 or pident < 0.8:
                        continue
                    row = fields + ((~genome_msa.seqDict)[src], str(src),
                                    src.start, src.stop,
                                    (~genome_msa.seqDict)[dest], str(dest),
                                    dest.start, dest.stop,
                                    '%.2f' % palign, '%.2f' % pident)
                    lines.append('\t'.join(map(str, row)) + '\n')
            lines.sort()
            return lines

        def write_report(label, annot_msa, annot_db, id_attr, path):
            'Write the conserved-alignment report for one annotation type'
            outfile = open(path, 'w')
            try:
                for chrid in chrList:
                    chrom = dm2[chrid]
                    # Only KeyError means "skip this chromosome"; any other
                    # exception is a real failure and must not be swallowed.
                    try:
                        mapped = annot_msa[chrom]
                    except KeyError:
                        continue
                    annots = [(getattr(ix, id_attr), ix)
                              for ix in mapped.keys()]
                    annots.sort()
                    for annot_id, annot in annots:
                        annot_seq = annot.sequence
                        # FOR REAL ANNOTATION COORDINATE
                        real_annot = annot_db[annot_id]
                        real_slice = real_annot.sequence
                        prefix = (label, chrid, getattr(real_annot, id_attr),
                                  real_annot.gene_id, real_slice.start,
                                  real_slice.stop)
                        outfile.writelines(conserved_lines(prefix, chrid,
                                                           annot_seq))
            finally:
                outfile.close() # flush even when the pass fails midway

        # each regenerated report must be identical to its reference copy
        write_report('EXON', exonmsa, exons, 'exon_id', newexonAnnotFileName)
        assert file_digest(tmpexonAnnotFileName) == \
                file_digest(newexonAnnotFileName)

        write_report('INTRON', splicemsa, splices, 'splice_id',
                     newintronAnnotFileName)
        assert file_digest(tmpintronAnnotFileName) == \
                file_digest(newintronAnnotFileName)
示例#12
0
    def test_collectionannot(self):
        'Test building an AnnotationDB from file'
        from pygr import seqdb, cnestedlist

        def load_annotations(path, slices, annot_db, nlmsa):
            'Store every tab-separated row of path and map its annotation'
            infile = open(path, 'r')
            try:
                for line in infile:
                    row = line.split('\t') # already a mutable list
                    row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
                    slices[row[1]] = row
                    # GET THE ANNOTATION OBJECT, SAVE IT TO GENOME MAPPING
                    nlmsa.addAnnotation(annot_db[row[1]])
            finally:
                infile.close() # do not rely on garbage collection

        def file_digest(path):
            'Return the MD5 digest of the contents of the file at path'
            infile = open(path, 'r')
            try:
                md5 = hashlib.md5()
                md5.update(infile.read())
                return md5.digest()
            finally:
                infile.close()

        dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
        # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
        exon_slices = Collection(
            filename=os.path.join(self.path, 'refGene_exonAnnot_dm2.cdb'),
            intKeys=True, mode='cr', writeback=False)
        exon_db = seqdb.AnnotationDB(exon_slices, dm2,
                                     sliceAttrDict=dict(id=0, exon_id=1,
                                                        orientation=2,
                                                        gene_id=3, start=4,
                                                        stop=5))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'refGene_exonAnnot_dm2'), 'w',
                                pairwiseMode=True, bidirectional=False)
        load_annotations(os.path.join(testInputDir,
                                      'refGene_exonAnnot%s_dm2.txt'
                                      % smallSamplePostfix),
                         exon_slices, exon_db, msa)
        exon_db.clear_cache() # not really necessary; cache should autoGC
        # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
        exon_slices.close()
        msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
        exon_db.__doc__ = 'Exon Annotation Database for dm2'
        pygr.Data.addResource('TEST.Annotation.dm2.exons', exon_db)
        msa.__doc__ = 'NLMSA Exon for dm2'
        pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.exons', msa)
        exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db,
                                                   bindAttrs=('exon1', ))
        exon_schema.__doc__ = 'Exon Schema for dm2'
        pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.exons', exon_schema)
        # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
        splice_slices = Collection(
            filename=os.path.join(self.path, 'refGene_spliceAnnot_dm2.cdb'),
            intKeys=True, mode='cr', writeback=False)
        splice_db = seqdb.AnnotationDB(splice_slices, dm2,
                                       sliceAttrDict=dict(id=0, splice_id=1,
                                                          orientation=2,
                                                          gene_id=3, start=4,
                                                          stop=5))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'refGene_spliceAnnot_dm2'), 'w',
                                pairwiseMode=True, bidirectional=False)
        load_annotations(os.path.join(testInputDir,
                                      'refGene_spliceAnnot%s_dm2.txt'
                                      % smallSamplePostfix),
                         splice_slices, splice_db, msa)
        splice_db.clear_cache() # not really necessary; cache should autoGC
        # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
        splice_slices.close()
        msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
        splice_db.__doc__ = 'Splice Annotation Database for dm2'
        pygr.Data.addResource('TEST.Annotation.dm2.splices', splice_db)
        msa.__doc__ = 'NLMSA Splice for dm2'
        pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.splices', msa)
        splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db,
                                                     bindAttrs=('splice1', ))
        splice_schema.__doc__ = 'Splice Schema for dm2'
        pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.splices', splice_schema)
        # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
        ucsc_slices = Collection(
            filename=os.path.join(self.path, 'phastConsElements15way_dm2.cdb'),
            intKeys=True, mode='cr', writeback=False)
        ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
                                     sliceAttrDict=dict(id=0, ucsc_id=1,
                                                        orientation=2,
                                                        gene_id=3, start=4,
                                                        stop=5))
        msa = cnestedlist.NLMSA(os.path.join(self.path,
                                             'phastConsElements15way_dm2'),
                                'w', pairwiseMode=True, bidirectional=False)
        load_annotations(os.path.join(testInputDir,
                                      'phastConsElements15way%s_dm2.txt'
                                      % smallSamplePostfix),
                         ucsc_slices, ucsc_db, msa)
        ucsc_db.clear_cache() # not really necessary; cache should autoGC
        # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
        ucsc_slices.close()
        msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
        ucsc_db.__doc__ = 'Most Conserved Elements for dm2'
        pygr.Data.addResource('TEST.Annotation.UCSC.dm2.mostconserved',
                              ucsc_db)
        msa.__doc__ = 'NLMSA for Most Conserved Elements for dm2'
        pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved',
                              msa)
        ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db,
                                                   bindAttrs=('element1', ))
        ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for dm2'
        pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved',
                            ucsc_schema)
        pygr.Data.save()
        pygr.Data.clear_cache() # force resources to reload when requested

        # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
        dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
        exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.exons')
        splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.splices')
        conservedmsa = \
          pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
        exons = pygr.Data.getResource('TEST.Annotation.dm2.exons')
        splices = pygr.Data.getResource('TEST.Annotation.dm2.splices')
        mostconserved = \
                pygr.Data.getResource('TEST.Annotation.UCSC.dm2.mostconserved')

        # OPEN DM2_MULTIZ15WAY NLMSA
        genome_msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'),
                                       'r', trypath=[seqDir])

        exonAnnotFileName = os.path.join(testInputDir,
                                  'Annotation_ConservedElement_Exons%s_dm2.txt'
                                         % smallSamplePostfix)
        intronAnnotFileName = os.path.join(testInputDir,
                                'Annotation_ConservedElement_Introns%s_dm2.txt'
                                           % smallSamplePostfix)
        newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
        newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
        tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
        tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

        if smallSampleKey:
            chrList = [smallSampleKey]
        else:
            chrList = sorted(dm2.seqLenDict.keys())

        def conserved_lines(prefix, chrid, annot_seq):
            'Return the sorted report lines for one annotation interval'
            try:
                conserved = conservedmsa[annot_seq]
            except KeyError:
                return []
            elements = [(ix.ucsc_id, ix) for ix in conserved.keys()]
            elements.sort()
            lines = []
            for sort_key, element in elements:
                if element.stop - element.start < 100:
                    continue
                # the score is the integer after the '=' in gene_id
                score = int(element.gene_id.split('=')[1])
                if score < 100:
                    continue
                element_seq = element.sequence
                # FOR REAL ELEMENT COORDINATE
                real_element = mostconserved[element.ucsc_id]
                real_slice = real_element.sequence
                fields = prefix + (real_element.ucsc_id, real_element.gene_id,
                                   real_slice.start, real_slice.stop)
                # query the alignment only where both intervals overlap
                start = max(annot_seq.start, element_seq.start)
                stop = min(annot_seq.stop, element_seq.stop)
                overlap = genome_msa.seqDict['dm2.' + chrid][start:stop]
                for src, dest, edge in genome_msa[overlap].edges():
                    if src.stop - src.start < 100:
                        continue
                    palign, pident = edge.pAligned(), edge.pIdentity()
                    if palign < 0.8 or pident < 0.8:
                        continue
                    row = fields + ((~genome_msa.seqDict)[src], str(src),
                                    src.start, src.stop,
                                    (~genome_msa.seqDict)[dest], str(dest),
                                    dest.start, dest.stop,
                                    '%.2f' % palign, '%.2f' % pident)
                    lines.append('\t'.join(map(str, row)) + '\n')
            lines.sort()
            return lines

        def write_report(label, annot_msa, annot_db, id_attr, path):
            'Write the conserved-alignment report for one annotation type'
            outfile = open(path, 'w')
            try:
                for chrid in chrList:
                    chrom = dm2[chrid]
                    # Only KeyError means "skip this chromosome"; any other
                    # exception is a real failure and must not be swallowed.
                    try:
                        mapped = annot_msa[chrom]
                    except KeyError:
                        continue
                    annots = [(getattr(ix, id_attr), ix)
                              for ix in mapped.keys()]
                    annots.sort()
                    for annot_id, annot in annots:
                        annot_seq = annot.sequence
                        # FOR REAL ANNOTATION COORDINATE
                        real_annot = annot_db[annot_id]
                        real_slice = real_annot.sequence
                        prefix = (label, chrid, getattr(real_annot, id_attr),
                                  real_annot.gene_id, real_slice.start,
                                  real_slice.stop)
                        outfile.writelines(conserved_lines(prefix, chrid,
                                                           annot_seq))
            finally:
                outfile.close() # flush even when the pass fails midway

        # each regenerated report must be identical to its reference copy
        write_report('EXON', exonmsa, exons, 'exon_id', newexonAnnotFileName)
        assert file_digest(tmpexonAnnotFileName) == \
                file_digest(newexonAnnotFileName)

        write_report('INTRON', splicemsa, splices, 'splice_id',
                     newintronAnnotFileName)
        assert file_digest(tmpintronAnnotFileName) == \
                file_digest(newintronAnnotFileName)
示例#13
0
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.

    in_genes -- path of a tab-separated gene table.  Counting from column 1,
        fields 0-2 are name, chrom and strand, fields 3-6 are txStart, txEnd,
        cdsStart and cdsEnd, fields 8 and 9 are the exon start and exon end
        lists (comma separated, with one trailing character that is
        dropped), and field 11 is the second gene name.
    out_files -- (out_fasta, out_db, out_msa): the fasta file to write, the
        file receiving the pickled {seq_id: sequence} dict, and the path
        handed to the NLMSA opened in write mode.

    Only genes assembled by splicing are written to out_fasta; records that
    use the whole transcribed region are aligned and pickled only.
    """
    out_fasta, out_db, out_msa = out_files
    startCol = 1
    msa = cnestedlist.NLMSA(out_msa, mode='w', pairwiseMode=True)
    genome = get_genome(None, None, touch_file=False)
    for chrom_seq in genome.values():
        msa += chrom_seq
    gene_db = {}
    outfile = open(out_fasta, 'w')
    try:
        infile = open(in_genes)
        try:
            for i, line in enumerate(infile):
                print(i)
                # parse
                fields = line.split('\t')
                name, chrom, strand = fields[startCol:startCol + 3]
                txStart, txEnd, cdsStart, cdsEnd = map(
                    int, fields[startCol + 3:startCol + 7])
                # list() keeps exons indexable whatever zip() returns
                exons = list(zip(
                    map(int, fields[startCol + 8][:-1].split(',')),
                    map(int, fields[startCol + 9][:-1].split(','))))
                name2 = fields[startCol + 11]
                noncoding = (name.startswith('NR_') or cdsStart < 0
                             or cdsEnd < 0)
                if 'hap' in chrom:
                    continue

                # create a record for the gene
                seq_id = '%s_%s_%s' % (i, name, name2)
                chrom_seq = genome[chrom]
                if noncoding or not exons:
                    # add entire tx region
                    region = chrom_seq[txStart:txEnd]
                    sequence = seqdb.Sequence(str(region), seq_id)
                    msa[region] += sequence
                else:
                    # Collect the genomic pieces once so that the spliced
                    # string and its alignment can never get out of step.
                    # NOTE(review): txStart:cdsStart is added in addition to
                    # every exon; confirm that exons starting before
                    # cdsStart are meant to be included twice.
                    regions = []
                    if txStart < cdsStart:
                        regions.append(chrom_seq[txStart:cdsStart])
                    regions.extend(chrom_seq[e_start:e_end]
                                   for e_start, e_end in exons
                                   if e_start < e_end)
                    if exons[-1][1] < txEnd:
                        print('%s %s' % (exons[-1], txEnd))
                        regions.append(chrom_seq[exons[-1][1]:txEnd])
                    # make the sequence by splicing parts
                    sequence = seqdb.Sequence(''.join(map(str, regions)),
                                              seq_id)
                    # save the sequence to a fasta file
                    outfile.write('>%s\n%s\n' % (seq_id, str(sequence)))
                    # make the alignment back to genomic coords
                    p_start = 0
                    for region in regions:
                        p_stop = p_start + len(region)
                        msa[region] += sequence[p_start:p_stop]
                        p_start = p_stop
                gene_db[seq_id] = sequence
        finally:
            infile.close()
    finally:
        outfile.close() # flush the fasta file even if a record fails
    msa.build(saveSeqDict=True)
    dbfile = open(out_db, 'w')
    try:
        pickle.dump(gene_db, dbfile)
    finally:
        dbfile.close() # an unclosed handle may leave the pickle truncated