示例#1
0
文件: defs.py 项目: xguse/gusPyProj
def spawnOrthoGroups(promoterFileList,nWayOrthoList):
    """Spawn one OrthoGroup per complete n-way ortholog group.

    Takes promoterFileList<listOfPaths> and nWayOrthoList<listOfLists> and
    returns a dictionary with keys = 'geneName1:geneName2:etc' (gene names
    sorted, joined with ':') and values = OrthoGroup(groupDict), where
    groupDict maps each gene name of the group to its entry loaded from the
    promoter files.  The result is meant to feed the combined hypergeometric
    analysis.

    A group is skipped (not added to the result) when any of its gene names
    is absent from the loaded promoters, maps to an empty/falsy entry, or
    occurs more than once in the group.  Skipping one group does not affect
    the groups that follow it.

    The inner lists of nWayOrthoList are not modified.

    Raises:
        AssertionError: if promoterFileList is not a non-empty list whose
            first element is a str, or if the same gene name is found twice
            while loading the promoter files.
    """

    # validation
    assert isinstance(promoterFileList, list), \
           '''promoterFileList must be a list of file paths.
           You provided type: "%s"'''\
           % (type(promoterFileList))
    # Guard the [0] access below so an empty list gives a readable message
    # instead of a bare IndexError.
    assert promoterFileList, \
           'promoterFileList must contain at least one file path.'
    assert isinstance(promoterFileList[0], str), \
           '''promoterFileList must be a list of file paths.
           promoterFileList[0] != type(''): "%s"'''\
           % (type(promoterFileList[0]))

    # load promoters
    allPromoters = {}
    for promoterFile in promoterFileList:
        oneGenome = Fasta.file2dict(promoterFile)
        for geneName in oneGenome:
            # Membership is tested on the dict itself; rebuilding a key list
            # for every gene made loading quadratic.
            assert geneName not in allPromoters, \
                   '''Detected duplicate gene name in promoterFileList! "%s"'''\
                   % (geneName,)
            allPromoters[geneName] = oneGenome[geneName]

    # Build Groups
    orthoGroups = {}
    for geneNames in nWayOrthoList:
        groupDict = {}
        for geneName in geneNames:
            # .get() turns an unknown gene name into a skipped group rather
            # than a KeyError; an empty entry is treated the same way.
            promoter = allPromoters.get(geneName)
            if not promoter:
                break # we do not want orthoGroups that are missing members
            groupDict[geneName] = promoter

        # A length mismatch means a member was missing (or duplicated):
        # skip only this group and keep processing the remaining ones.
        if len(groupDict) != len(geneNames):
            continue

        # sorted() builds the key without reordering the caller's list.
        orthoGroups[':'.join(sorted(geneNames))] = OrthoGroup(groupDict)

    return orthoGroups
示例#2
0
from TAMO.seq import Fasta
from gusPyCode.defs import bioDefs

# Input fasta of mature miRNAs and the intended seed output path.
# NOTE(review): seedFile is not used in the visible part of this script.
miRNAFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
seedFile  = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa'

oligoType = 'control' # 'match' or 'control'
# `oligoType == 'match' or 'control'` is always truthy (the non-empty string
# 'control' is the value of the `or`), so test membership explicitly.
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1 
#    and then pos2. Adapt key to reflect which. 
# 2) Convert to all uppers and convert U's to T's
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    # Slices [:7] and [1:8] are the 7-mers starting at the 1st and 2nd base.
    pos1_seed = miRNAs[miRNA][:7].upper().replace('U','T')
    pos2_seed = miRNAs[miRNA][1:8].upper().replace('U','T')


    if oligoType == 'match':
        seeds[miRNA+'_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA+'_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA+'_ctrl_pos1'] = pos1_seed
        seeds[miRNA+'_ctrl_pos2'] = pos2_seed
示例#3
0
from TAMO.MotifTools import top_nmers,Motif
from TAMO import MotifTools
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import ifKmerInAll

# Input sequences, output path and search parameters.
seqFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
outFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.8mersInAll.txt'
kmerSize    = 8
scoreThresh = 0.999999

seqs = Fasta.file2dict(seqFile)



# create new dict to store the seqs' kmers
seqsKmers = {}
for i in seqs:
    seqsKmers[i] = top_nmers(kmerSize,[seqs[i]], purge_Ns = 1)   # for some reason top_nmers fails silently if given str instead of list

# inAllSeqs keeps the accepted kmers in first-seen order; testedKmers records
# every kmer already passed to ifKmerInAll so each distinct kmer is tested
# once and the duplicate check is a set lookup instead of a list scan.
# NOTE(review): assumes kmers are hashable (strings) and that ifKmerInAll
# gives the same answer for the same kmer -- confirm against bioDefs.
inAllSeqs = []
testedKmers = set()
count = 0
for seq in seqsKmers:
    for kmer in seqsKmers[seq]:
        if kmer in testedKmers:
            continue
        testedKmers.add(kmer)
        if ifKmerInAll(kmer,seqs,scoreThresh):
            inAllSeqs.append(kmer)
            count+=1
            # progress output: running number of kmers found so far
            print(count)


# The path name is rebound to the open, writable file handle from here on.
outFile = open(outFile, 'w')
示例#4
0
parser.add_option('-f', dest="make_fasta", action="store_true",default=False,
                  help="""Produce relavent fasta files too. (default=%default)""")


(opts, args) = parser.parse_args()

# --- A Little Extra Input Validation ---
# args[0] is the gene list file and args[1] the fasta file.
if len(args) < 2:
    parser.print_help()
    print('\nERROR: Both geneListFile and fastaFile are required!')
    # exit() is a helper installed by the site module; raising SystemExit
    # ends the script with status 1 without depending on it.
    raise SystemExit(1)



# Read the gene names (one per line), closing the file once they are read.
with open(args[0], 'rU') as geneListFile:
    geneNames = [l.strip() for l in geneListFile]
totalSeqs = Fasta.file2dict(args[1])
randClusterLists = genRandClusters(geneNames,totalSeqs,N=opts.N, keepLen=1)

# -- Make Out Folder --
mkdirp(opts.out_dir)


for i, randCluster in enumerate(randClusterLists):
    # Output name is derived from the base name of the gene list file.
    oFileName = args[0].replace('.txt','randomGeneNames_%s.txt' % (i)).split('/')[-1]
    oFilePath = '%s/%s' % (opts.out_dir,oFileName)
    # `with` closes the file even if a write fails.
    with open(oFilePath, 'w') as oFile:
        for name in randCluster:
            oFile.write(name+'\n')
    # --- If Asked, Create Fastas ---
    if opts.make_fasta:
        # Re-read the names just written, one stripped name per line.
        with open(oFilePath, 'rU') as namesFile:
            fNames = [l.strip() for l in namesFile]
示例#5
0
from TAMO.seq import Fasta
#from gusPyCode.defs.JamesDefs import revComp

# Paths: sequences to search, seeds to search for, and the results file.
fFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/Fastas/Aa_500afterCoding.usuable.stpCdn.fas'
sFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa'
oFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/SeedCountOutPut/counts/miRBaseMatureSeedsOn_Aa_500afterCoding.ctrl.txt'

print('WARNING!!\nThis script now takes the exact k-mer to be searched!!!\nGive it the "match" or the "control" specifically.\n(match is rvcmp\'d version of miRNA seed)\nIT WILL _NOT_ REVCOMP IT FOR YOU!!!!\n')

# --------- Fasta Prep ---------
fastas    = Fasta.file2dict(fFile)
seqNames  = sorted(fastas.keys())
# Sequences are soft-masked; upper-casing every one removes the masking.
for seq in seqNames:
    fastas[seq] = fastas[seq].upper()

# --------- Seed Prep ---------
seeds     = Fasta.file2dict(sFile)
seedNames = sorted(seeds.keys())
# Upper-case the seeds too, so only uppercase strings are searched for.
for seed in seedNames:
    seeds[seed] = seeds[seed].upper()


# First results row: header with the sorted seed names as column titles.
results = ['#seqName\t%s' % '\t'.join(seedNames)]

def findSeedsInSeq(seeds,seedNames,seqStr,seqName):
    '''take dict of seeds, a seq, and its name. Return tsv string of 
    seqName followed by 0s and 1s corelating with presence