def test_correlation(self):
    """Check ``correlation`` against three independent cross-correlations.

    Two standardized score vectors, each with one spike at a random
    position, are turned into streams and passed to ``correlation``.  The
    result is compared with a hand-made sliding dot product, with
    ``numpy.correlate(mode='valid')`` on every overlap, and with
    ``numpy.correlate(mode='full')``.  Finally the position of the maximum
    must correspond to the distance between the two spikes.
    """
    numpy.set_printoptions(precision=3, suppress=True)

    size = 10
    # Draw both peak positions first, in the same order as before.
    xpeak = numpy.random.randint(0, size)
    ypeak = numpy.random.randint(0, size)

    def _spike(position):
        # Zero everywhere except `position`, then centred and scaled.
        scores = numpy.zeros(size)
        scores[position] = 10
        return (scores - numpy.mean(scores)) / numpy.std(scores)

    x = _spike(xpeak)
    y = _spike(ypeak)

    # Make tracks out of the vectors: one unit-length feature per score.
    x_rows = [('chr', pos, pos + 1, score) for pos, score in enumerate(x)]
    y_rows = [('chr', pos, pos + 1, score) for pos, score in enumerate(y)]
    x_stream = fstream(iter(x_rows), fields=['chr', 'start', 'end', 'score'])
    y_stream = fstream(iter(y_rows), fields=['chr', 'start', 'end', 'score'])
    corr = correlation([x_stream, y_stream], regions=(0, size))

    # Overlapping slices of x and y for every lag.
    # First half: the tail of x meets the head of y, the overlap growing
    # from a single position (k=0) to the full length (k=size-1).
    overlaps = [(x[-k - 1:], y[:k + 1]) for k in range(size)]
    # Second half: the head of x meets the tail of y, the overlap shrinking
    # from size-1 positions down to a single one.
    overlaps += [(x[:k], y[-k:]) for k in range(size - 1, 0, -1)]

    # Cross-correlation "by hand" ...
    raw = numpy.asarray([numpy.dot(a, b) / size for a, b in overlaps])
    # ... with numpy.correlate(mode='valid'), one value per overlap ...
    np_corr_valid = numpy.concatenate(
        [numpy.correlate(a, b, mode='valid') for a, b in overlaps]) / size
    # ... and with numpy.correlate(mode='full') in one call.
    np_corr_full = numpy.correlate(x, y, mode="full")[::-1] / size

    # All methods must yield the same result.
    assert_almost_equal(corr, raw)
    assert_almost_equal(corr, np_corr_full)
    assert_almost_equal(corr, np_corr_valid)

    # The lag between the two tracks must be correctly detected.
    self.assertEqual(numpy.argmax(corr) - (size - 1), ypeak - xpeak)
def test_correlation(self):
    """Compare ``correlation`` with reference cross-correlation computations.

    Builds two normalized vectors holding a single randomly placed peak,
    streams them through ``correlation`` and verifies the outcome against a
    manual dot-product scan and both the 'valid' and 'full' modes of
    ``numpy.correlate``.  The index of the maximum, re-centred by
    ``N - 1``, has to equal ``ypeak - xpeak``.
    """
    numpy.set_printoptions(precision=3, suppress=True)
    N = 10
    stream_fields = ['chr', 'start', 'end', 'score']

    # Two score vectors, zero everywhere except at one random position.
    xpeak = numpy.random.randint(0, N)
    ypeak = numpy.random.randint(0, N)
    x = numpy.zeros(N)
    x[xpeak] = 10
    y = numpy.zeros(N)
    y[ypeak] = 10
    # Standardize both vectors (zero mean, unit standard deviation).
    x = (x - numpy.mean(x)) / numpy.std(x)
    y = (y - numpy.mean(y)) / numpy.std(y)

    # Turn each vector into a stream and run the function under test.
    streams = []
    for vector in (x, y):
        features = [('chr', i, i + 1, value) for i, value in enumerate(vector)]
        streams.append(fstream(iter(features), fields=stream_fields))
    corr = correlation(streams, regions=(0, N))

    def _overlaps():
        # Y slides leftwards under X: growing overlap first (tail of x
        # against head of y), then shrinking overlap (head of x against
        # tail of y).
        for k in range(N):
            yield x[-k - 1:], y[:k + 1]
        for k in range(N - 1, 0, -1):
            yield x[:k], y[-k:]

    # Reference values: plain dot products and numpy.correlate('valid').
    raw = []
    np_corr_valid = []
    for x_part, y_part in _overlaps():
        raw.append(numpy.dot(x_part, y_part) / N)
        np_corr_valid.extend(numpy.correlate(x_part, y_part, mode='valid'))
    np_corr_valid = numpy.asarray(np_corr_valid) / N

    # Reference values from numpy.correlate('full'), reversed to match.
    np_corr_full = numpy.correlate(x, y, mode="full")[::-1] / N

    # Every method has to agree with `correlation`.
    for expected in (numpy.asarray(raw), np_corr_full, np_corr_valid):
        assert_almost_equal(corr, expected)

    # The detected lag must equal the distance between the two peaks.
    detected_lag = numpy.argmax(corr) - (N - 1)
    self.assertEqual(detected_lag, ypeak - xpeak)
def merge(*args, **kw):
    """Merge a forward- and a reverse-strand density track into one track.

    Every feature of the forward track is moved by ``+shift`` and every
    feature of the reverse track by ``-shift``; the two shifted streams are
    combined chromosome by chromosome with ``merge_scores`` and written to
    the output track.

    Keyword arguments (read from ``kw``; positional arguments are ignored):
        forward: path of an existing forward-strand density file.
        reverse: path of an existing reverse-strand density file.
        output: name of the file to write.
        formatf, formatr: formats passed to ``track.track`` for the inputs.
        shift: shift value, converted with ``int``.  A negative value means
            the shift is derived from the cross-correlation of the two
            strands on the longest chromosome.
        method: optional merging method handed to ``merge_scores``
            (defaults to ``"mean"``).
    The complete ``kw`` is also forwarded to ``_get_chrmeta``.

    Returns:
        0 once the output track has been written.

    Raises:
        Usage: if an input file is missing, no output name is given, or
            neither input track carries chromosome information.
    """
    if not (kw['forward'] and os.path.exists(kw['forward'])):
        raise Usage("Specify a valid forward strand density file with -f.")
    if not (kw['reverse'] and os.path.exists(kw['reverse'])):
        raise Usage("Specify a valid reverse strand density file with -r.")
    if not kw['output']:
        raise Usage("Specify the output file name.")

    def _shift(stream, shift):
        """Return a stream whose 'start' and 'end' are moved by *shift*."""
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            # Rebuild the tuple, replacing only the two coordinate items.
            return (x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2]
                    + (x[i2] + shift,) + x[i2 + 1:])

        return track.FeatureStream((_apply_shift(x) for x in stream),
                                   fields=stream.fields)

    fields = ['chr', 'start', 'end', 'score']
    chrmeta = _get_chrmeta(**kw)
    # Every track opened below is registered here so that it gets closed
    # even when an error (e.g. the missing-assembly Usage) interrupts us.
    opened = []
    try:
        tfwd = track.track(kw['forward'], format=kw['formatf'],
                           chrmeta=chrmeta)
        opened.append(tfwd)
        trev = track.track(kw['reverse'], format=kw['formatr'],
                           chrmeta=chrmeta)
        opened.append(trev)
        if tfwd.chrmeta:
            chrmeta = tfwd.chrmeta
        elif trev.chrmeta:
            chrmeta = trev.chrmeta
        else:
            raise Usage("Specify an assembly with -a.")

        shiftval = int(kw['shift'])
        if shiftval < 0:
            # Estimate the shift on the longest chromosome.
            slim = 300
            chrsize, chrom = max((v['length'], k)
                                 for k, v in chrmeta.iteritems())
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               (1, chrsize), limits=(-slim, slim))
            # NOTE(review): presumably entry `slim` of xcor is lag 0; the
            # additional -1 is kept from the original code -- confirm
            # against correlation().
            shiftval = (xcor.argmax() - slim - 1) / 2
            print("Autocorrelation shift=%i, correlation is %f."
                  % (shiftval, xcor.max()))

        tout = track.track(kw['output'], fields=fields, chrmeta=chrmeta,
                           info={'datatype': 'quantitative'})
        opened.append(tout)
        mode = 'write'
        method = kw.get("method", "mean")
        for chrom in chrmeta:
            merged = merge_scores([_shift(tfwd.read(chrom), shiftval),
                                   _shift(trev.read(chrom), -shiftval)],
                                  method=method)
            tout.write(merged, chrom=chrom, mode=mode, clip=True)
            # Only the first chromosome creates the file.
            mode = 'append'
    finally:
        # Same closing order as before: output, reverse, forward.
        for opened_track in reversed(opened):
            opened_track.close()
    return 0
def __call__(self, **kw):
    """Merge a forward and a reverse density track into one output track.

    The forward track is shifted by ``+shift`` and the reverse track by
    ``-shift``; both are merged per chromosome with ``merge_scores`` and
    written to a temporary file registered as ``'density_merged'``.  The
    output keeps the fields of the forward track that also exist in the
    reverse track.

    Keyword arguments (read from ``kw``):
        forward, reverse: inputs given to ``track``.
        assembly: optional assembly; ``'guess'`` is used when absent/empty.
        shift: shift value (default 0).  A negative value requests
            automatic detection from the strand cross-correlation.
        format: optional output extension (defaults to the forward
            track's format).
        method: optional merging method (defaults to ``"mean"``).

    Returns:
        The value of ``self.display_time()``.

    Raises:
        ValueError: if automatic detection was requested and no chromosome
            shows a correlation maximum above 0.2.
    """
    def _shift(stream, shift):
        """Return a stream whose 'start' and 'end' are moved by *shift*."""
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            # Rebuild the tuple, replacing only the two coordinate items.
            return (x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2]
                    + (x[i2] + shift,) + x[i2 + 1:])

        return FeatureStream((_apply_shift(x) for x in stream),
                             fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    # Tracks are registered as they are opened so that all of them are
    # closed even if detection or writing fails.
    opened = []
    try:
        tfwd = track(kw.get('forward'), chrmeta=assembly)
        opened.append(tfwd)
        trev = track(kw.get('reverse'), chrmeta=assembly)
        opened.append(trev)
        chrmeta = tfwd.chrmeta

        shiftval = int(kw.get('shift', 0))
        if shiftval < 0:  # Determine shift automatically
            shiftval = None
            xcor_lim = 300
            for chrom, v in chrmeta.iteritems():
                chrsize = v['length']
                # NOTE(review): 0.01 * chrsize is a float, so xcor_lim (and
                # the shift derived from it) may be non-integer, and the
                # limit only ever shrinks from one chromosome to the next
                # -- confirm both are intended.
                xcor_lim = min(xcor_lim, 0.01 * chrsize)
                xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                                   regions=(1, chrsize),
                                   limits=(-xcor_lim, xcor_lim))
                max_xcor_idx = xcor.argmax()
                # Accept the first chromosome with a clear enough maximum.
                if xcor[max_xcor_idx] > 0.2:
                    shiftval = (max_xcor_idx - xcor_lim - 1) / 2
                    break
            # Compare with None: a detected shift of 0 is a valid result
            # and must not be reported as a detection failure.
            if shiftval is None:
                raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

        output = self.temporary_path(fname=tfwd.name + '-' + trev.name + '_merged',
                                     ext=kw.get('format', tfwd.format))
        # Keep only the fields shared by both inputs, in forward order.
        outfields = [f for f in tfwd.fields if f in trev.fields]
        tout = track(output, chrmeta=chrmeta, fields=outfields,
                     info={'datatype': 'quantitative', 'shift': shiftval})
        opened.append(tout)
        mode = 'write'
        method = kw.get("method", "mean")
        for chrom in chrmeta.keys():
            merged = merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                   _shift(trev.read(selection=chrom), -shiftval)],
                                  method=method)
            tout.write(merged, chrom=chrom, mode=mode, clip=True)
            # Only the first chromosome creates the file.
            mode = 'append'
    finally:
        # Same closing order as before: output, reverse, forward.
        for opened_track in reversed(opened):
            opened_track.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def __call__(self, **kw):
    """Merge a forward and a reverse density track into one output track.

    The forward track is shifted by ``+shift`` and the reverse track by
    ``-shift``; both are merged per chromosome with ``merge_scores`` and
    written to a temporary file registered as ``'density_merged'``.

    Keyword arguments (read from ``kw``):
        forward, reverse: inputs given to ``track``.
        assembly: optional assembly; ``'guess'`` is used when absent/empty.
        shift: shift value (default 0).  A negative value requests
            automatic detection from the strand cross-correlation.
        format: optional output extension (defaults to the forward
            track's format).
        method: optional merging method (defaults to ``"mean"``).

    Returns:
        The value of ``self.display_time()``.

    Raises:
        ValueError: if automatic detection was requested and no chromosome
            shows a correlation maximum above 0.2.
    """
    def _shift(stream, shift):
        """Return a stream whose 'start' and 'end' are moved by *shift*."""
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            # Rebuild the tuple, replacing only the two coordinate items.
            return (x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2]
                    + (x[i2] + shift,) + x[i2 + 1:])

        return FeatureStream((_apply_shift(x) for x in stream),
                             fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    # Tracks are registered as they are opened so that all of them are
    # closed even if detection or writing fails.
    opened = []
    try:
        tfwd = track(kw.get('forward'), chrmeta=assembly)
        opened.append(tfwd)
        trev = track(kw.get('reverse'), chrmeta=assembly)
        opened.append(trev)
        chrmeta = tfwd.chrmeta

        shiftval = int(kw.get('shift', 0))
        if shiftval < 0:  # Determine shift automatically
            shiftval = None
            xcor_lim = 300
            for chrom, v in chrmeta.iteritems():
                chrsize = v['length']
                # NOTE(review): 0.01 * chrsize is a float, so xcor_lim (and
                # the shift derived from it) may be non-integer, and the
                # limit only ever shrinks from one chromosome to the next
                # -- confirm both are intended.
                xcor_lim = min(xcor_lim, 0.01 * chrsize)
                xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                                   regions=(1, chrsize),
                                   limits=(-xcor_lim, xcor_lim))
                max_xcor_idx = xcor.argmax()
                # Accept the first chromosome with a clear enough maximum.
                if xcor[max_xcor_idx] > 0.2:
                    shiftval = (max_xcor_idx - xcor_lim - 1) / 2
                    break
            # Compare with None: a detected shift of 0 is a valid result
            # and must not be reported as a detection failure.
            if shiftval is None:
                raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

        output = self.temporary_path(fname=tfwd.name + '-' + trev.name + '_merged',
                                     ext=kw.get('format', tfwd.format))
        tout = track(output, chrmeta=chrmeta,
                     info={'datatype': 'quantitative', 'shift': shiftval})
        opened.append(tout)
        mode = 'write'
        method = kw.get("method", "mean")
        for chrom in chrmeta.keys():
            merged = merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                   _shift(trev.read(selection=chrom), -shiftval)],
                                  method=method)
            tout.write(merged, chrom=chrom, mode=mode, clip=True)
            # Only the first chromosome creates the file.
            mode = 'append'
    finally:
        # Same closing order as before: output, reverse, forward.
        for opened_track in reversed(opened):
            opened_track.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def __call__(self, **kw):
    """Draw pairwise plots of several signal tracks over a set of features.

    Keyword arguments (read from ``kw``):
        feature_type: 0 gene bodies (default), 1 promoters, 2 exons,
            3 custom features track.
        assembly: assembly identifier; mandatory unless feature_type is 3.
        SigMulti: dict whose 'signals' entry is one signal or a list.
        HiMulti: dict whose 'highlights' entry is one track or a list of
            tracks whose scores are appended as highlighted sets.
        upstream, downstream: promoter extent for feature_type 1
            (defaults ``prom_up_def`` / ``prom_down_def``).
        features: the custom track for feature_type 3.
        mode: 0 for correlation plots, 1 for density plots.
        cormax: maximal lag in correlation mode (default ``_cormax``).

    Returns:
        The value of ``self.display_time()``; the plot is registered as
        ``'plot_pairs'``.

    Raises:
        ValueError: for a missing assembly, an unknown feature type, an
            unknown mode, or when no data could be collected.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")

    signals = kw.get('SigMulti', {}).get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]

    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)

    highlights = kw.get('HiMulti', {}).get('highlights', [])
    # The None test has to come before the list wrapping, otherwise a
    # missing value would become [None] and be opened as a track.
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]

    def _score_rows(stream, chrom):
        # Score every signal over `stream` on `chrom`; returns the pair
        # produced by score_array for the added score columns.
        if 'name' not in stream.fields:
            stream = add_name_field(stream)
        means = score_by_feature([s.read(chrom) for s in signals], stream)
        return score_array(means, means.fields[len(stream.fields):])

    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    set_index = []
    set_labels = []
    mode = int(kw['mode'])
    if mode == 0:  # correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        # Collect the first three items of every feature, chromosome by
        # chromosome in sorted order.
        regions = [x[:3]
                   for chrom in sorted(chrmeta.keys())
                   for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals],
                           regions, (-cormax, cormax), True)
    elif mode == 1:  # density
        xarr = None
        for chrom in chrmeta:
            rows, _ = _score_rows(features(chrom), chrom)
            if rows.size == 0:
                continue
            narr = rows if narr is None else vstack((narr, rows))
        # Fail with the intended message before narr.shape is touched.
        if narr is None:
            raise ValueError("No data")
        # Row count after the plain features, then after each highlight.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                rows, labels = _score_rows(hitrack.read(chrom), chrom)
                if rows.size == 0:
                    continue
                narr = vstack((narr, rows))
                set_labels.extend(labels)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def merge(*args, **kw):
    """Merge a forward- and a reverse-strand density track into one track.

    Every feature of the forward track is moved by ``+shift`` and every
    feature of the reverse track by ``-shift``; the two shifted streams are
    combined chromosome by chromosome with ``merge_scores`` and written to
    the output track.

    Keyword arguments (read from ``kw``; positional arguments are ignored):
        forward: path of an existing forward-strand density file.
        reverse: path of an existing reverse-strand density file.
        output: name of the file to write.
        formatf, formatr: formats passed to ``track.track`` for the inputs.
        shift: shift value, converted with ``int``.  A negative value means
            the shift is derived from the cross-correlation of the two
            strands on the longest chromosome.
        method: optional merging method handed to ``merge_scores``
            (defaults to ``"mean"``).
    The complete ``kw`` is also forwarded to ``_get_chrmeta``.

    Returns:
        0 once the output track has been written.

    Raises:
        Usage: if an input file is missing, no output name is given, or
            neither input track carries chromosome information.
    """
    if not (kw['forward'] and os.path.exists(kw['forward'])):
        raise Usage("Specify a valid forward strand density file with -f.")
    if not (kw['reverse'] and os.path.exists(kw['reverse'])):
        raise Usage("Specify a valid reverse strand density file with -r.")
    if not kw['output']:
        raise Usage("Specify the output file name.")

    def _shift(stream, shift):
        """Return a stream whose 'start' and 'end' are moved by *shift*."""
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            # Rebuild the tuple, replacing only the two coordinate items.
            return (x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2]
                    + (x[i2] + shift,) + x[i2 + 1:])

        return track.FeatureStream((_apply_shift(x) for x in stream),
                                   fields=stream.fields)

    fields = ['chr', 'start', 'end', 'score']
    chrmeta = _get_chrmeta(**kw)
    # Every track opened below is registered here so that it gets closed
    # even when an error (e.g. the missing-assembly Usage) interrupts us.
    opened = []
    try:
        tfwd = track.track(kw['forward'], format=kw['formatf'],
                           chrmeta=chrmeta)
        opened.append(tfwd)
        trev = track.track(kw['reverse'], format=kw['formatr'],
                           chrmeta=chrmeta)
        opened.append(trev)
        if tfwd.chrmeta:
            chrmeta = tfwd.chrmeta
        elif trev.chrmeta:
            chrmeta = trev.chrmeta
        else:
            raise Usage("Specify an assembly with -a.")

        shiftval = int(kw['shift'])
        if shiftval < 0:
            # Estimate the shift on the longest chromosome.
            slim = 300
            chrsize, chrom = max((v['length'], k)
                                 for k, v in chrmeta.iteritems())
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               (1, chrsize), limits=(-slim, slim))
            # NOTE(review): presumably entry `slim` of xcor is lag 0; the
            # additional -1 is kept from the original code -- confirm
            # against correlation().
            shiftval = (xcor.argmax() - slim - 1) / 2
            print("Autocorrelation shift=%i, correlation is %f."
                  % (shiftval, xcor.max()))

        tout = track.track(kw['output'], fields=fields, chrmeta=chrmeta,
                           info={'datatype': 'quantitative'})
        opened.append(tout)
        mode = 'write'
        method = kw.get("method", "mean")
        for chrom in chrmeta:
            merged = merge_scores([_shift(tfwd.read(chrom), shiftval),
                                   _shift(trev.read(chrom), -shiftval)],
                                  method=method)
            tout.write(merged, chrom=chrom, mode=mode, clip=True)
            # Only the first chromosome creates the file.
            mode = 'append'
    finally:
        # Same closing order as before: output, reverse, forward.
        for opened_track in reversed(opened):
            opened_track.close()
    return 0
def __call__(self, **kw):
    """Draw pairwise plots of several signal tracks over a set of features.

    Keyword arguments (read from ``kw``):
        feature_type: 0 gene bodies (default), 1 promoters, 2 exons,
            3 custom features track.
        individual: flag (bool, or one of the strings "1", "true", "t",
            "on", case-insensitive) asking for one correlation per
            feature; only accepted in correlation mode.
        assembly: assembly identifier; mandatory unless feature_type is 3.
        signals: one signal or a list of signals.
        highlights: one track or a list of tracks whose scores are
            appended as highlighted sets.
        upstream, downstream: promoter extent for feature_type 1
            (defaults ``prom_up_def`` / ``prom_down_def``).
        features: the custom track for feature_type 3.
        mode: 1 for correlation plots, 0 for density plots.
        cormax: maximal lag in correlation mode (default ``_cormax``).

    In correlation mode a tab-separated table holding the maximal
    correlation and its lag is written and registered as ``'table'``.

    Returns:
        The value of ``self.display_time()``; the plot is registered as
        ``'plot_pairs'``.

    Raises:
        ValueError: for 'individual' outside correlation mode, a missing
            assembly, an unknown feature type, an unknown mode, or when
            no data could be collected.
    """
    feature_type = int(kw.get("feature_type") or 0)
    individual = kw.get("individual", False)
    if isinstance(individual, basestring):
        individual = individual.lower() in ["1", "true", "t", "on"]
    if individual and int(kw["mode"]) != 1:
        raise ValueError("Only correlation plots can work with the 'individual' option.")

    assembly_id = kw.get("assembly") or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")

    signals = kw.get("signals", [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]

    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {
            "before_start": int(kw.get("upstream") or prom_up_def),
            "after_start": int(kw.get("downstream") or prom_down_def),
            "on_strand": True,
        }
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get("features"), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)

    highlights = kw.get("highlights", [])
    # The None test has to come before the list wrapping, otherwise a
    # missing value would become [None] and be opened as a track.
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]

    def _score_rows(stream, chrom):
        # Score every signal over `stream` on `chrom`; returns the pair
        # produced by score_array for the added score columns.
        if "name" not in stream.fields:
            stream = add_name_field(stream)
        means = score_by_feature([s.read(chrom) for s in signals], stream)
        return score_array(means, means.fields[len(stream.fields):])

    pdf = self.temporary_path(fname="plot_pairs.pdf")
    narr = None
    set_index = []
    set_labels = []
    _new = True
    mode = int(kw["mode"])
    if mode == 1:  # correl
        cormax = int(kw.get("cormax") or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        _f = ["chr", "start", "end", "score"]
        # Collect the first three items of every feature.
        regions = [x[:3]
                   for chrom in chrmeta
                   for x in sorted_stream(features(chrom))]

        def _peak(corr_arr):
            # Maximum of the first correlation vector and its lag.
            list_corr = list(corr_arr[0][0])
            max_corr = max(list_corr)
            return max_corr, list_corr.index(max_corr) - cormax

        table = self.temporary_path(fname="table.txt")
        with open(table, "w") as t:
            t.write("\t".join(["chr", "start", "end", "max(correlation)", "lag_max"]) + "\n")
            if individual:
                for nplot, feature in enumerate(regions):
                    # Plot the previous feature's result; the last one is
                    # drawn by the final pairs() call below.
                    if narr is not None and nplot < _MAX_PLOTS_:
                        pairs(narr, xarr, labels=snames, output=pdf,
                              new=_new, last=False)
                        _new = False
                    narr = correlation([s.read(fields=_f) for s in signals],
                                       [feature], (-cormax, cormax), True)
                    max_corr, lag_max = _peak(narr)
                    t.write("\t".join([str(x) for x in feature[:3] + (max_corr, lag_max)]) + "\n")
            else:
                narr = correlation([s.read(fields=_f) for s in signals],
                                   regions, (-cormax, cormax), True)
                max_corr, lag_max = _peak(narr)
                t.write("\t".join(["-", "-", "-"] + [str(max_corr), str(lag_max)]) + "\n")
    elif mode == 0:  # density
        xarr = None
        for chrom in chrmeta:
            rows, _ = _score_rows(features(chrom), chrom)
            if rows.size == 0:
                continue
            narr = rows if narr is None else vstack((narr, rows))
        # Fail with the intended message before narr.shape is touched.
        if narr is None:
            raise ValueError("No data")
        # Row count after the plain features, then after each highlight.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                rows, labels = _score_rows(hitrack.read(chrom), chrom)
                if rows.size == 0:
                    continue
                narr = vstack((narr, rows))
                set_labels.extend(labels)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw["mode"])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels], new=_new, last=True)
    if mode == 1:
        self.new_file(table, "table")
    self.new_file(pdf, "plot_pairs")
    return self.display_time()
def __call__(self, **kw):
    """Draw pairwise plots of several signal tracks over a set of features.

    Keyword arguments (read from ``kw``):
        feature_type: 0 gene bodies (default), 1 promoters, 2 exons,
            3 custom features track.
        assembly: assembly identifier; mandatory unless feature_type is 3.
        SigMulti: dict whose 'signals' entry is one signal or a list.
        HiMulti: dict whose 'highlights' entry is one track or a list of
            tracks whose scores are appended as highlighted sets.
        upstream, downstream: promoter extent for feature_type 1
            (defaults ``prom_up_def`` / ``prom_down_def``).
        features: the custom track for feature_type 3.
        mode: 0 for correlation plots, 1 for density plots.
        cormax: maximal lag in correlation mode (default ``_cormax``).

    Returns:
        The value of ``self.display_time()``; the plot is registered as
        ``'plot_pairs'``.

    Raises:
        ValueError: for a missing assembly, an unknown feature type, an
            unknown mode, or when no data could be collected.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")

    signals = kw.get('SigMulti', {}).get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]

    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)

    highlights = kw.get('HiMulti', {}).get('highlights', [])
    # The None test has to come before the list wrapping, otherwise a
    # missing value would become [None] and be opened as a track.
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]

    def _score_rows(stream, chrom):
        # Score every signal over `stream` on `chrom`; returns the pair
        # produced by score_array for the added score columns.
        if 'name' not in stream.fields:
            stream = add_name_field(stream)
        means = score_by_feature([s.read(chrom) for s in signals], stream)
        return score_array(means, means.fields[len(stream.fields):])

    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    set_index = []
    set_labels = []
    mode = int(kw['mode'])
    if mode == 0:  # correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        # Collect the first three items of every feature, chromosome by
        # chromosome in sorted order.
        regions = [x[:3]
                   for chrom in sorted(chrmeta.keys())
                   for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals],
                           regions, (-cormax, cormax), True)
    elif mode == 1:  # density
        xarr = None
        for chrom in chrmeta:
            rows, _ = _score_rows(features(chrom), chrom)
            if rows.size == 0:
                continue
            narr = rows if narr is None else vstack((narr, rows))
        # Fail with the intended message before narr.shape is touched.
        if narr is None:
            raise ValueError("No data")
        # Row count after the plain features, then after each highlight.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                rows, labels = _score_rows(hitrack.read(chrom), chrom)
                if rows.size == 0:
                    continue
                narr = vstack((narr, rows))
                set_labels.extend(labels)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()