示例#1
0
def count_words_tree(sequences,
                     l,
                     searchLocation,
                     strand='+-',
                     overlap=False,
                     error=0,
                     spacing=(1, 1)):
    """Index the words of length l found in sequences with a suffix tree.

    sequences       -- sized iterable of sequence objects; each one must
                       provide .location (indexable with [0] and [1]) and
                       .get_dna(location)
    l               -- oligonucleotide (word) length
    searchLocation  -- location tuple to scan, example (-200,-1)
    strand          -- '+-' gathers positions through
                       ST.get_positions_two_strands, any other value
                       through ST.get_positions
    overlap         -- forwarded to SuffixTree(overlapping=...) and to
                       ST.get_positions_two_strands
    error           -- forwarded to SuffixTree(maxIUPAC=...)
    spacing         -- forwarded to SuffixTree(NExtension=...)

    return a dict with the keys
    N                -- {l: list} holding one counter per position of
                        searchLocation; a counter is incremented for each
                        scanned word that contains no 'N'
    H                -- whatever ST.get_positions_two_strands /
                        ST.get_positions builds from the extracted words
    scannedPositions -- always 0 (never updated by this function)
    scannedWords     -- always 0 (never updated by this function)
    """
    # NOTE(review): the value returned here is never used; the call is kept
    # in case find_location validates its input or has side effects.
    find_location(sequences)

    window_start = searchLocation[0]
    window_end = searchLocation[1]

    # Per-position counters for word size l, one slot per window position.
    N = {l: [0] * (window_end - window_start + 1)}
    per_position = N[l]

    # NOTE(review): both totals are reported but never incremented below.
    scannedPositions = 0
    scannedWords = 0

    progress = cli.Info(len(sequences), 1, 1)

    # Build the suffix tree that stores every word together with its position.
    tree = ST.SuffixTree(maxDepth=l,
                         overlapping=overlap,
                         maxIUPAC=error,
                         NExtension=spacing,
                         storePosition=True)

    for seq in sequences:
        progress('Counting words in [%+05d:%+05d]' %
                 (window_start, window_end))

        # Clip the search window to the part this sequence actually covers.
        seq_start = seq.location[0]
        first = max(window_start, seq_start)
        last = min(window_end, seq.location[1] - l + 1)

        fragment = seq.get_dna((first, last + l + 1))
        tree.add_dna(fragment, shift=first)

        # NOTE(review): the slice offset is taken relative to the sequence
        # start although fragment was requested from `first` -- confirm
        # against get_dna that this indexing is intended.
        for pos in range(first, last + 1):
            rel = pos - seq_start
            word = fragment[rel:rel + l]
            if word.find('N') < 0:
                per_position[pos - window_start] += 1

        # Debug aid: ST.display(tree.root, maxDepth=6, full=1)

    # Pull out the words of exactly length l and collect their positions.
    extracted = tree.extract(minLength=l, maxLength=l)

    if strand != '+-':
        H = ST.get_positions(extracted)
    else:
        H = ST.get_positions_two_strands(extracted, overlap)

    return {
        'N': N,
        'H': H,
        'scannedPositions': scannedPositions,
        'scannedWords': scannedWords,
    }