예제 #1
0
파일: parse.py 프로젝트: bh0085/everySNAKE
def getKMERsForName(libname = 'mouse',tissue_term = None, **kwargs):
    '''
Return the k-mers obtained by calling "parse" on the fasta file referred
to as libname. The call goes through mem.getOrSet.

Optionally, specify a tissue term that will serve as a regex filter on the 
FASTA record descriptors. Specifying a term such as "brain" will do
a case insensitive search on records in the library to return only kmers
from records matching "term".

kwargs
restored: when present, its value is handed to mem.getOrSet as "update"
          (presumably to overwrite the stored entry -- confirm against mem).
Every other keyword is merged with libname/tissue_term/name by mem.rc and
passed on to mem.getOrSet.

returns
whatever mem.getOrSet returns, in both the plain and the "restored" case.
'''
    def setKMERsForName(**kwargs):
        # libname (and tissue_term) reach parse() through kwargs.
        return parse(**kwargs)

    # The entry name is the library name, extended by the tissue filter
    # when one is given.
    if tissue_term is None:
        name = libname
    else:
        name = '{0}_tissue={1}'.format(libname, tissue_term)

    overrides = dict(libname = libname,
                     tissue_term = tissue_term,
                     name = name)
    if 'restored' in kwargs:
        overrides['update'] = kwargs['restored']

    # Previously the "restored" branch dropped the result and returned None;
    # both paths now return what mem.getOrSet produces.
    return mem.getOrSet(setKMERsForName, **mem.rc(kwargs, **overrides))
예제 #2
0
파일: genes.py 프로젝트: bh0085/everySNAKE
def getTrackChrGenes(**kwargs):
    '''
Fetch every gene record that a bed file holds for one chromosome.

kwargs
num:   chromosome number (1 when omitted)
fname: bedfile path (the global bedfile when omitted)

returns
(through mem.getOrSet) a list with one dict per gene, pairing the
record's keys with its data.
'''
    def setTrackChrGenes(**kwargs):
        bed_path = kwargs.get('fname', bedfile)
        chrom_num = kwargs.get('num', 1)
        source = track.load(bed_path)
        records = source.read('chr{0}'.format(chrom_num))
        genes = []
        for rec in iter(records):
            genes.append(dict(zip(rec.keys(), rec.data)))
        return genes

    # The entry is named after the bed file and the chromosome number.
    default_label = os.path.basename(bedfile)
    cache_name = '{0}_{1}'.format(kwargs.get('fname', default_label),
                                  kwargs.get('num', 1))
    settings = mem.rc(kwargs, onfail = 'compute', name = cache_name)
    return mem.getOrSet(setTrackChrGenes, **settings)
예제 #3
0
파일: genes.py 프로젝트: bh0085/everySNAKE
def plotPeaks(num = 1):
    '''
Plot how peaks are distributed around forward promoters on a chromosome.

For chromosome num, every peak whose midpoint lies less than 5000 away
from a promoter midpoint contributes its signed offset to a 20-bin
histogram over [-5000, 5000]. The counts are obtained through
mem.getOrSet and drawn on figure 1.

num: chromosome number (default 1).
'''
    import cb.utils.plots as myplots

    def setHist(**kwargs):
        peaks = getPeaks()['chr{0}'.format(num)]
        proms = getTrackChrPromoters(num = num)

        # Peak midpoints do not depend on the promoter; compute them once
        # instead of once per promoter.
        peak_mids = [(p['start'] + p['end']) / 2 for p in peaks]

        # The bin edges are fixed by the bin count and range, so take them
        # from an empty histogram. This keeps bin_offsets defined even when
        # there are no promoters at all.
        bin_offsets = histogram([], 20, [-5000, 5000])[1]
        all_hits = zeros(20)
        for bounds in proms.values():
            mid = (bounds[0] + bounds[1]) / 2
            deltas = [pmid - mid for pmid in peak_mids
                      if abs(pmid - mid) < 5000]
            all_hits += histogram(deltas, 20, [-5000, 5000])[0]
        return bin_offsets, all_hits

    bin_offsets, hits = mem.getOrSet(setHist,
                                     num = num)
    f = myplots.fignum(1)
    ax = f.add_subplot(111)
    ax.set_xlabel('distance from promoter')
    ax.set_ylabel('counts')
    # There is one more edge than there are bins; plot against left edges.
    ax.plot(bin_offsets[:-1],hits)
예제 #4
0
파일: genes.py 프로젝트: bh0085/everySNAKE
def getTrackChrPromoters(**kwargs):
    '''
Get all of the forward promoters from a bed file
on a given chromosome.

kwargs
num:   chromosome number (1 when omitted)
fname: bedfile path (uses global bedfile as the default)

returns
(through mem.getOrSet) a dict mapping each forward gene's name to the
pair [start - 2000, start - 100].
'''
    def setTrackChrPromoters(**kwargs):
        fname = kwargs.get('fname', bedfile)
        num = kwargs.get('num', 1)
        t = track.load(fname)
        chromosome_data = t.read('chr{0}'.format(num))
        rows = [dict(zip(r.keys(), r.data)) for r in chromosome_data]
        # Only rows with strand == 1 are kept; the promoter window runs
        # from 2000 to 100 before the gene's start coordinate.
        return dict((e['name'], [e['start'] - 2000, e['start'] - 100])
                    for e in rows if e['strand'] == 1)

    # Forward the caller's kwargs (as getTrackChrGenes does) so the setter
    # sees the same fname/num that the entry name is built from. Previously
    # they were dropped and the setter always fell back to its defaults.
    # NOTE(review): relies on mem.getOrSet handing its keywords to the
    # setter -- confirm against mem.
    return mem.getOrSet(setTrackChrPromoters,
                        **mem.rc(kwargs,
                                 onfail = 'compute',
                                 name = '{0}_{1}'.format(kwargs.get('fname',os.path.basename(bedfile)),
                                                         kwargs.get('num', 1))))
예제 #5
0
파일: genes.py 프로젝트: bh0085/everySNAKE
def mapAllGenes(**kwargs):
    '''
Attach to every gene on chr1..chr19 and chrX the peaks lying near it.

kwargs are passed to mem.getOrSet; inside the computation they are
combined with the current chromosome number via mem.sr and handed to
getTrackChrGenes.

returns
(through mem.getOrSet) a dict keyed by 'chr<N>'. Each value is a dict
keyed by gene name whose entries hold:
dnase_peaks: the nearby peaks, sorted by their smaller stranded offset
name, start, end, strand: copied from the gene record
gene_info: the full gene record
'''
    def setAllGenes(**kwargs):
        allPeaks = getPeaks()
        all_results = {}

        #if you were running for a larger dataset you might want to
        #break this loop after a single iteration and just choose a chromosome
        for num in list(range(1, 20)) + ['X']:
            chrom = 'chr{0}'.format(num)
            print('Parsing Chromosome: {0}'.format(chrom))
            genes_dict = {}
            all_results[chrom] = genes_dict

            #get the genes on a chromosome
            chrgenes = getTrackChrGenes(**mem.sr(kwargs, num = num))
            #get the peaks on a chromosome
            peaks = allPeaks[chrom]

            for i, g in enumerate(chrgenes):
                strand = g['strand']
                # Offsets are measured from 'start' for strand 1 genes and
                # from 'end' otherwise.
                startpos = g['start'] if strand == 1 else g['end']
                hits = []

                #list features near this gene.
                for p in peaks:
                    stranded_offset = array([strand * (p['start'] - startpos),
                                             strand * (p['end'] - startpos)])
                    # Keep the peak when either edge is within 2000 of the
                    # anchor, or when the edges have opposite signs (the
                    # peak spans the anchor, so the product is negative).
                    if (np.min(abs(stranded_offset)) < 2000
                            or np.prod(stranded_offset) < 0):
                        stranded_offset.sort()
                        hits.append({'peak_info':p,
                                     'peak_stranded_offset':stranded_offset})

                #store some extra information in the dictionary that we'll output
                hits.sort(key = lambda h: h['peak_stranded_offset'][0])
                genes_dict[g['name']] = {
                    'dnase_peaks':hits,
                    'name':g['name'],
                    'gene_info':g,
                    'start':g['start'],
                    'end':g['end'],
                    'strand':strand
                    }

                # Progress report every 100 genes.
                if i % 100 == 0:
                    print('Gene {0}: {1}, {2} hits'.format(i, g['name'], len(hits)))

        return all_results
    return mem.getOrSet(setAllGenes, **kwargs)
예제 #6
0
파일: genes.py 프로젝트: bh0085/everySNAKE
def getPeaks():
    '''
Get all of the peaks from the global narrowpeak file
on all chromosomes.

kwargs
none:

returns
(through mem.getOrSet) a dict mapping each chromosome name found in the
first column to the list of peaks on it. Every peak is a dict of the
remaining columns (start, end, name, score, strand, signalValue, pValue,
qValue, peak); start, end and peak are converted to int, the other
fields are kept as read.
'''
    def setPeaks(**kwargs):
        # Column names, in file order. Built once rather than per line.
        cols = ['chrom',
                'start',
                'end',
                'name',
                'score',
                'strand',
                'signalValue',
                'pValue',
                'qValue',
                'peak']
        peaks = {}
        with open(peakfile) as pf:
            # Iterate the file lazily instead of loading every line first.
            for line in pf:
                line = line.rstrip('\r\n')
                # A blank (e.g. trailing) line has no columns to convert.
                if not line.strip():
                    continue
                grps = line.split('\t')

                #note, peak is a zero based offset from start
                hit = dict(zip(cols[1:], grps[1:]))
                hit['start'] = int(hit['start'])
                hit['end'] = int(hit['end'])
                hit['peak'] = int(hit['peak'])

                peaks.setdefault(grps[0], []).append(hit)

        return peaks
    return mem.getOrSet(setPeaks,
                        onfail = 'compute')
예제 #7
0
파일: parse.py 프로젝트: bh0085/everySNAKE
def getTranslatedForName(libname, **kwargs):
    '''Translate kMERs to a numerical array for downstream analysis.

    libname is the library whose k-mers are fetched with getKMERsForName;
    an optional tissue_term keyword only affects the entry name here and
    is otherwise passed along with the remaining kwargs.

    Returns (through mem.getOrSet) a triple:
    idxed_mers  -- dict mapping row index to k-mer
    translated  -- array with one row per k-mer and k columns, each letter
                   replaced by its code from the module-level translation
                   table (4 for letters missing from the table)
    occurrences -- array of the value stored for each k-mer, in row order
    '''
    def setTranslatedForName(**kwargs):
        # k and translation are module-level names that are only read here.
        # The loop variables below deliberately avoid the name "k": under
        # Python 2 a list-comprehension variable leaks into the function
        # scope, and combined with the former "global k" declaration that
        # overwrote the module-level k with the last k-mer.
        libname = kwargs.get('libname')
        o = getKMERsForName( **mem.sr(kwargs, libname = libname))
        mers = list(o.keys())
        translated = zeros((len(mers), k))
        idxed_mers = dict(enumerate(mers))
        occurrences = array([o[mer] for mer in mers])
        for i, mer in enumerate(mers):
            translated[i] = [translation.get(letter, 4) for letter in mer]
        return idxed_mers, translated, occurrences

    tissue_term = kwargs.get('tissue_term', None)
    if tissue_term is None:
        name = libname
    else:
        name = '{0}_tissue={1}'.format(libname, tissue_term)

    return mem.getOrSet(setTranslatedForName,
                        **mem.rc(kwargs,
                                 libname = libname,
                                 name = name))
예제 #8
0
def getBandCollectionAliases(**kwargs):
    '''
Collect aliases for every band in a named collection by querying freebase.

kwargs
name: key into the module-level band_collectionnames mapping (required;
      a missing key raises KeyError).
The remaining keywords are merged by mem.rc and passed to mem.getOrSet.

returns
(through mem.getOrSet) a flat list holding, for each group in the query
results, its members' alias values followed by the group's own.
'''
    def setBandCollectionAliases(name = None, **kwargs):
        assert name is not None
        all_aliases = []
        # This line was tab-indented among space-indented lines, which
        # Python 3 rejects with a TabError.
        freebase = discovery.build('freebase', 'v1', developerKey=DEVELOPER_KEY)
        names_list = band_collectionnames[name]
        for n in names_list:
            # One query per band name, asking for the group's aliases and
            # the aliases of each of its members.
            q = [{
                    "name~=":"{0}".format(n),
                    "type":  "/music/musical_group",

                    "/common/topic/alias": [{
                            "value": None
                            }],
                    "/music/musical_group/member": [{
                            "member": {
                                "/common/topic/alias": [{
                                        "value": None
                                        }]
                                }
                            }],
                    }]
            responses = json.loads(freebase.mqlread(query=json.dumps(q)).execute())
            for band in responses['result']:
                member_aliases = [a['value']
                                  for e in band["/music/musical_group/member"]
                                  for a in e['member']["/common/topic/alias"]]
                band_aliases = [a['value'] for a in band["/common/topic/alias"]]
                all_aliases.extend(member_aliases)
                all_aliases.extend(band_aliases)

        return all_aliases

    name = kwargs['name']
    return mem.getOrSet(setBandCollectionAliases, **mem.rc(kwargs,
                                                           name = name))
예제 #9
0
def getKMERsForName(libname):
    """Return the result of parse() for libname, obtained via mem.getOrSet.

    libname is used both as the argument to parse and as the entry name.
    """
    def setKMERsForName(**kwargs):
        # Use the library name handed over by the caller of this setter.
        # The original fetched it into an unused local and then read the
        # closure variable instead.
        return parse(kwargs["libname"])

    return mem.getOrSet(setKMERsForName, libname=libname, name=libname)