def get_svd_learn_clusters(accu_path, data=None, sing_threshold=2.0, assign_clstr=0.1, vis=False):
    """Runs the decomposition once with the maximum number of singular values,
    then reruns it keeping only the singular values greater than sing_threshold."""
    (N, f) = data.shape
    all_components = min(N, f)
    U, Sigma, VT = randomized_svd(data, n_components=all_components, n_iter=5, random_state=None)
    # print "Sigma:", Sigma

    # Keep only the components whose singular value exceeds the threshold
    best_components = sum(Sigma > sing_threshold)
    U, Sigma, VT = randomized_svd(data, n_components=best_components, n_iter=5, random_state=None)

    # Assign each document to its strongest latent concept; 100 marks "unassigned"
    pred_labels = [np.argmax(doc) if np.max(doc) > assign_clstr else 100 for doc in U]
    # print "predicted classes:", pred_labels
    utils.screeplot(accu_path, Sigma, all_components, vis)

    # Plot a graph for each right singular vector (VT)
    max_, min_ = 0, 100
    for i in VT:
        if max(i) > max_:
            max_ = max(i)
        if min(i) < min_:
            min_ = min(i)

    if vis:
        with open(accu_path + "/graphlets.p", 'r') as f:
            graphlets = pickle.load(f)

    for i, vocabulary in enumerate(VT):
        title = 'Latent Concept %s' % i
        utils.genome(accu_path, vocabulary, [min_, max_], title)
        if vis:
            for c, v in enumerate(vocabulary):
                if v > 0.1:
                    print "\n", c, graphlets[c]
    return U, Sigma, VT
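# Hedged sketch (not part of the original module): the two-pass idea used in
# get_svd_learn_clusters, reduced to its core. Run randomized_svd once with the
# maximum number of components, count the singular values above a threshold,
# then rerun keeping only those. The toy Poisson count matrix is an assumption
# for illustration; any documents-by-terms count matrix would do.
def _example_svd_threshold(sing_threshold=2.0):
    import numpy as np
    from sklearn.utils.extmath import randomized_svd
    rng = np.random.RandomState(0)
    data = rng.poisson(1.0, size=(20, 50)).astype(np.float64)  # 20 docs x 50 terms
    U, Sigma, VT = randomized_svd(data, n_components=min(data.shape), n_iter=5,
                                  random_state=None)
    best = int(np.sum(Sigma > sing_threshold))  # number of "significant" concepts
    return randomized_svd(data, n_components=best, n_iter=5, random_state=None)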
def loadhic(filename, genome='hg19', resolution=100000, usechr=['#', 'X'], verbose=False):
    from . import straw
    tgenome = alabutils.genome(genome)
    bininfo = tgenome.bininfo(resolution)
    m = contactmatrix(len(bininfo.chromList), genome=genome, resolution=resolution, usechr=usechr)
    for chr1 in tgenome.info['chrom']:
        i = tgenome.getchrnum(chr1)
        for chr2 in tgenome.info['chrom']:
            j = tgenome.getchrnum(chr2)
            # Only fill the upper triangle; the matrix is symmetrised below
            if i > j:
                continue
            if verbose:
                print chr1, chr2
            # straw expects chromosome names without the "chr" prefix
            result = straw.straw("NONE", filename, chr1[3:], chr2[3:], 'BP', resolution)
            for t in range(len(result[0])):
                x = int(result[0][t] / resolution) + bininfo.binStart[i]
                y = int(result[1][t] / resolution) + bininfo.binStart[j]
                m.matrix[x, y] = result[2][t]
                m.matrix[y, x] = result[2][t]
    return m
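# Hedged usage sketch (the .hic path below is a placeholder, and the surrounding
# module's contactmatrix/alabutils/straw imports are assumed to be available):
# load a Hi-C file at 1 Mb resolution and inspect the resulting dense matrix.
def _example_loadhic(hicfile="sample.hic"):
    m = loadhic(hicfile, genome='hg19', resolution=1000000,
                usechr=['#', 'X'], verbose=True)
    print m.matrix.shape
    return m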
def dump_lda_output(path, doc_topic, topic_word):
    f = open(os.path.join(path, "doc_topic.p"), "w")
    pickle.dump(doc_topic, f)
    f.close()

    f = open(os.path.join(path, "topic_word.p"), "w")
    pickle.dump(topic_word, f)
    f.close()

    # Plot a graph for each topic word distribution (vocabulary)
    max_, min_ = 0, 100
    for i in topic_word:
        if max(i) > max_:
            max_ = max(i)
        if min(i) < min_:
            min_ = min(i)
    for i, vocabulary in enumerate(topic_word):
        title = 'Topic %s' % i
        utils.genome(path, vocabulary, [min_, max_], title)
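# Hedged sketch (not part of the original code): read back the two pickles
# written by dump_lda_output. The file names match the ones used above; the
# directory is an assumption for illustration.
def _example_load_lda_output(path="/tmp/lda_out"):
    f = open(os.path.join(path, "doc_topic.p"), "r")
    doc_topic = pickle.load(f)
    f.close()
    f = open(os.path.join(path, "topic_word.p"), "r")
    topic_word = pickle.load(f)
    f.close()
    return doc_topic, topic_word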
def __init__(self, filename, genome=None, resolution=None, usechr=['#', 'X']):
    self._applyedMethods = {}
    if isinstance(filename, int):
        # An integer argument builds an empty n x n matrix
        self.matrix = np.zeros((filename, filename), dtype=np.float32)
    elif isinstance(filename, str):
        if not os.path.isfile(filename):
            raise IOError("File %s doesn't exist!\n" % (filename))
        if os.path.splitext(filename)[1] == '.hdf5' or os.path.splitext(filename)[1] == '.hmat':
            # Binary hdf5/hmat input: matrix, index and metadata are stored as datasets
            h5f = h5py.File(filename, 'r')
            self.matrix = h5f['matrix'][:]
            self.idx = h5f['idx'][:]
            if 'applyedMethods' in h5f.keys():
                self._applyedMethods = cPickle.loads(h5f['applyedMethods'].value)
            if 'genome' in h5f.keys() and 'resolution' in h5f.keys():
                self.genome = cPickle.loads(h5f['genome'].value)
                self.resolution = cPickle.loads(h5f['resolution'].value)
            h5f.close()
        else:
            # Plain-text input: 3 index columns followed by n value columns per row
            from alabio import loadstream
            f = loadstream(filename)
            s = f.next()
            line = re.split('\t+|\s+', s.rstrip())
            n = len(line) - 3
            idx = []
            i = 0
            tidx = line[0:3]
            tidx.append('')
            idx.append(tidx)
            self.matrix = np.zeros((n, n), dtype=np.float32)
            self.matrix[i] = line[3:]
            for s in f:
                i += 1
                line = re.split('\t+|\s+', s.rstrip())
                tidx = line[0:3]
                tidx.append('')
                idx.append(tidx)
                self.matrix[i] = line[3:]
            f.close()
            self.idx = np.core.records.fromarrays(np.array(idx).transpose(), dtype=self._idxdtype)
    else:
        raise RuntimeError("Undefined input filename type!\n")
    #----------------end filename
    if isinstance(genome, str) and isinstance(resolution, int):
        if hasattr(self, "genome") and hasattr(self, "resolution"):
            raise RuntimeError("Genome and resolution have already been specified.")
        genomedb = alabutils.genome(genome, usechr=usechr)
        bininfo = genomedb.bininfo(resolution)
        flaglist = ['' for i in range(len(bininfo.chromList))]
        self.genome = genome
        self.resolution = resolution
        self._buildindex(bininfo.chromList, bininfo.startList, bininfo.endList, flaglist)
def __init__(self, probfile, nucleusRadius=5000.0, contactRange=1, level=None, record=-1):
    self.probmat = matrix.contactmatrix(probfile)
    self.nbead = len(self.probmat)

    # setup log
    LEVELS = {'debug': logging.DEBUG,
              'info': logging.INFO,
              'warning': logging.WARNING,
              'error': logging.ERROR,
              'critical': logging.CRITICAL}
    loglevel = LEVELS.get(level, logging.NOTSET)
    self.logger = logging.getLogger()
    self.logger.setLevel(loglevel)
    self._log_capture_string = StringIO()
    chhandler = logging.StreamHandler(self._log_capture_string)
    chhandler.setLevel(loglevel)
    self.logger.addHandler(chhandler)
    self.logger.setLevel(loglevel)

    # setup record
    self._record_step = record
    if record >= 100:
        self.record = []

    # CONST
    rscale = 1.38                         # 20% occupancy
    self.nucleusRadius = nucleusRadius    # nm
    cdensity = 107.45                     # bp/nm, assuming 197 bp/nucleosome and 6 nucleosomes/11 nm
    kscale = (0.75 * 15**2)**(1.0 / 3.0)  # 3/4*r**2 where r = 15 nm
    self.contactRange = contactRange      # surface-to-surface distance, as a scale of (r1+r2),
                                          # for which 2 beads are considered in contact

    # get radius of each bead
    self.beadRadius = [rscale * kscale * ((index['end'] - index['start']) / cdensity)**(1.0 / 3.0)
                       for index in self.probmat.idx]

    # calculate the total volume of DNA (diploid) and nucleus
    dnavol = sum(4. * 3.1415 / 3. * np.array(self.beadRadius)**3) * 2
    nucvol = (4 * 3.1415 / 3) * self.nucleusRadius**3
    # and chromosome occupancy
    dnaocc = dnavol / nucvol
    self.logger.debug('occupancy: %.2f with Rnuc %d' % (dnaocc, self.nucleusRadius))

    # diploid Rb; 2x total haploid beads
    self.beadRadius = self.beadRadius + self.beadRadius

    # Chromosome territory apply
    self.genome = utils.genome(self.probmat.genome)
    cscale = 1.0
    chrvol = nucvol * self.genome.info['length'] / sum(self.genome.info['length']) / 2
    self.chromRadius = cscale * ((chrvol / 4 * 3 / 3.1415)**(1. / 3.))

    # record starting time
    self.model = IMP.Model()
    self.chain = IMP.container.ListSingletonContainer(self.model)
    self.restraints = IMP.RestraintSet(self.model)
    #IMP.set_check_level(IMP.USAGE)
    IMP.set_check_level(IMP.NONE)
    IMP.set_log_level(IMP.SILENT)

    # setup nucleus envelope
    self.center = IMP.algebra.Vector3D(0, 0, 0)
def __init__(self, filename, genome=None, resolution=None, usechr=['#', 'X']):
    self._applyedMethods = {}
    if isinstance(filename, int):
        # An integer argument builds an empty n x n matrix
        self.matrix = np.zeros((filename, filename), dtype=np.float32)
    elif isinstance(filename, str):
        if not os.path.isfile(filename):
            raise IOError("File %s doesn't exist!\n" % (filename))
        if os.path.splitext(filename)[1] == '.hdf5' or os.path.splitext(filename)[1] == '.hmat':
            # Binary hdf5/hmat input: matrix, index and metadata are stored as datasets
            h5f = h5py.File(filename, 'r')
            self.matrix = h5f['matrix'][:]
            self.idx = h5f['idx'][:]
            if 'applyedMethods' in h5f.keys():
                self._applyedMethods = cPickle.loads(h5f['applyedMethods'].value)
            if 'genome' in h5f.keys() and 'resolution' in h5f.keys():
                self.genome = cPickle.loads(h5f['genome'].value)
                self.resolution = cPickle.loads(h5f['resolution'].value)
            h5f.close()
        else:
            # Plain-text input: 3 index columns followed by n value columns per row
            from alabio import loadstream
            f = loadstream(filename)
            s = f.next()
            line = re.split('\t+|\s+', s.rstrip())
            n = len(line) - 3
            # If genome/resolution are given, check that the file has the expected bin count
            expectn = n
            if isinstance(genome, str) and isinstance(resolution, int):
                genomedb = alabutils.genome(genome, usechr=usechr)
                bininfo = genomedb.bininfo(resolution)
                expectn = len(bininfo.chromList)
            if expectn != n:
                raise RuntimeError("Dimensions don't match: expected %s bins, got %s bins. Please check the input." % (expectn, n))
            idx = []
            i = 0
            tidx = line[0:3]
            tidx.append('')
            idx.append(tidx)
            self.matrix = np.zeros((n, n), dtype=np.float32)
            self.matrix[i] = line[3:]
            for s in f:
                i += 1
                line = re.split('\t+|\s+', s.rstrip())
                tidx = line[0:3]
                tidx.append('')
                idx.append(tidx)
                self.matrix[i] = line[3:]
            f.close()
            self.idx = np.core.records.fromarrays(np.array(idx).transpose(), dtype=self._idxdtype)
    else:
        raise RuntimeError("Undefined input filename type!\n")
    #----------------end filename
    if isinstance(genome, str) and isinstance(resolution, int):
        if hasattr(self, "genome") and hasattr(self, "resolution"):
            raise RuntimeError("Genome and resolution have already been specified.")
        genomedb = alabutils.genome(genome, usechr=usechr)
        bininfo = genomedb.bininfo(resolution)
        flaglist = ['' for i in range(len(bininfo.chromList))]
        self.genome = genome
        self.resolution = resolution
        self._buildindex(bininfo.chromList, bininfo.startList, bininfo.endList, flaglist)
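# Hedged usage sketch (file names are placeholders): the constructor above accepts
# an integer (empty n x n matrix), an .hdf5/.hmat file, or a plain-text matrix with
# three index columns followed by n value columns; passing genome and resolution
# builds the bin index when it is not already stored in the file.
def _example_contactmatrix():
    empty = contactmatrix(100)  # 100 x 100 zero matrix, no index
    # from_text = contactmatrix("contacts.txt", genome='hg19', resolution=100000)
    # from_hdf5 = contactmatrix("contacts.hmat")
    return empty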
def __init__(self, probfile, nucleusRadius=5000.0, chromosomeOccupancy=0.2, contactRange=1, level=None):
    self.probmat = alabmatrix.contactmatrix(probfile)
    self.nbead = len(self.probmat)

    # setup log
    LEVELS = {'debug': logging.DEBUG,
              'info': logging.INFO,
              'warning': logging.WARNING,
              'error': logging.ERROR,
              'critical': logging.CRITICAL}
    loglevel = LEVELS.get(level, logging.NOTSET)
    self.logger = logging.getLogger()
    self.logger.setLevel(loglevel)
    self._log_capture_string = io.StringIO()
    chhandler = logging.StreamHandler(self._log_capture_string)
    chhandler.setLevel(loglevel)
    self.logger.addHandler(chhandler)
    self.logger.setLevel(loglevel)

    # CONST
    #rscale = 1.38  # 20% occupancy
    self.occupancy = chromosomeOccupancy  # chromosome occupancy in nucleus, defined as
                                          # diploid_domain_total_volume / nuclear_volume
    self.nucleusRadius = nucleusRadius    # nm
    #cdensity = 107.45                  # bp/nm, assuming 197 bp/nucleosome and 6 nucleosomes/11 nm
    #kscale = (0.75*15**2)**(1.0/3.0)   # 3/4*r**2 where r = 15 nm
    self.contactRange = contactRange      # surface-to-surface distance, as a scale of (r1+r2),
                                          # for which 2 beads are considered in contact
    self.genome = alabutils.genome(self.probmat.genome)
    rho = self.occupancy * self.nucleusRadius**3 / (2 * sum(self.genome.info['length']))

    # get radius of each bead
    self.beadRadius = [(rho * (index['end'] - index['start']))**(1.0 / 3.0)
                       for index in self.probmat.idx]
    #self.beadRadius = [rscale * kscale * ((index['end'] - index['start'])/cdensity) ** (1.0/3.0) for index in self.probmat.idx]

    # calculate the total volume of DNA (diploid) and nucleus
    dnavol = sum(4. * 3.1415 / 3. * np.array(self.beadRadius)**3) * 2
    nucvol = (4 * 3.1415 / 3) * self.nucleusRadius**3
    # and chromosome occupancy
    dnaocc = dnavol / nucvol
    self.logger.debug(u'Occupancy: %.2f with Rnuc %d' % (dnaocc, self.nucleusRadius))

    # diploid Rb; 2x total haploid beads
    self.beadRadius = self.beadRadius + self.beadRadius

    # Chromosome territory apply
    #self.genome = alabutils.genome(self.probmat.genome)
    cscale = 1.0
    chrvol = nucvol * self.genome.info['length'] / sum(self.genome.info['length']) / 2
    self.chromRadius = cscale * ((chrvol / 4 * 3 / 3.1415)**(1. / 3.))

    # record starting time
    self.model = IMP.Model()
    self.chain = IMP.container.ListSingletonContainer(self.model)
    self.restraints = IMP.RestraintSet(self.model)
    #IMP.set_check_level(IMP.USAGE)
    IMP.set_check_level(IMP.NONE)
    IMP.set_log_level(IMP.SILENT)

    # setup nucleus envelope
    self.center = IMP.algebra.Vector3D(0, 0, 0)
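# Hedged worked example (the numbers are illustrative assumptions, not taken from
# the original code): the bead-sizing rule above sets r_bead = (rho * L_bin)**(1/3)
# with rho = occupancy * Rnuc**3 / (2 * total_genome_length), so that the summed
# diploid bead volume equals occupancy times the nuclear volume.
def _example_bead_radius(occupancy=0.2, nucleusRadius=5000.0,
                         genome_length=3.1e9, bin_length=1.0e6):
    rho = occupancy * nucleusRadius**3 / (2 * genome_length)
    return (rho * bin_length)**(1.0 / 3.0)  # bead radius in nm for a 1 Mb domain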