Пример #1
0
 def test_allkinds(self):
     """Draw one Venn diagram per sample count dictionary (1 to 4 sets).

     Each dictionary maps a group label (set names joined with '|') to a
     count. Output files are written under the externally defined ``path``.
     """
     from bbcflib.gfminer.figure import venn
     fmt = 'pdf'
     one_set = {'A':126}
     two_sets = {'Albert':126, 'Barthur':247, 'Albert|Barthur':50}
     three_sets_named = {'Ar':521, 'Bi':14, 'Co':290, 'Ar|Bi':11, 'Ar|Co':100, 'Bi|Co':4, 'Ar|Bi|Co':1}
     three_sets_abc = {'A':521, 'B':300, 'C':290, 'A|B':11, 'A|C':100, 'B|C':44, 'A|B|C':5}
     four_sets = {'A':210, 'B':220, 'C':230, 'D':240,
                  'A|B':80, 'A|C':80, 'A|D':80, 'B|C':80, 'B|D':80, 'C|D':80,
                  'A|B|C':30, 'A|B|D':30, 'A|C|D':30, 'B|C|D':30, 'A|B|C|D':10}
     # (counts, output file stem, extra keyword arguments), in drawing order.
     cases = [
         (one_set, 'd1.', {'legend': ['file1.bed']}),
         (two_sets, 'd2.', {}),
         (three_sets_named, 'd3.1.', {}),
         (three_sets_abc, 'd3.2.', {'legend': ['file1','file2','file3']}),
         (four_sets, 'd4.', {'legend': ['file1','file2','file3','file4']}),
     ]
     for counts, stem, extra in cases:
         venn(counts, output=path+stem+fmt, format=fmt, **extra)
Пример #2
0
 def __call__(self,**kw):
     """Compare the given files, archive the result and optionally draw a Venn diagram.

     Reads ``kw['SigMulti']['files']`` (the list of input files) and
     ``kw['column']`` (converted to an int and decremented by one before
     being handed to ``self.compare``).

     The output of ``self.compare`` is packed into a gzip-compressed tar
     archive which is registered as 'intersections'. When at most four
     files are given, a PNG Venn diagram of the counts is also produced
     and registered as 'venn_diagram'.

     Returns the value of ``self.display_time()``.
     """
     files_list = kw['SigMulti']['files']
     column = int(kw['column'])-1
     output = self.temporary_path(fname='intersections.')
     # NOTE(review): `legend` is returned by compare() but venn() below is
     # explicitly called with legend=None -- confirm this is intended.
     counts,legend = self.compare(files_list, output, column)
     # compress; the context manager closes the archive even if add() fails
     output_targz = self.temporary_path(fname=output+'tar.gz')
     with tarfile.open(output_targz, 'w:gz') as tar:
         tar.add(output)
     # Register the archive that was actually written (previously a
     # different, double-dotted name was registered).
     self.new_file(output_targz, 'intersections')
     if len(files_list) <= 4:
         # Venn diagram
         venn_format = 'png'
         # Keep a dot between the base name and the extension.
         venn_outname = self.temporary_path(fname='venn.'+venn_format)
         venn(counts,legend=None,options={},output=venn_outname,format=venn_format)
         self.new_file(venn_outname, 'venn_diagram')
     return self.display_time()
Пример #3
0
 def __call__(self, **kw):
     """Compare the given files, archive the result and optionally draw a Venn diagram.

     Reads ``kw['SigMulti']['files']`` (the list of input files) and
     ``kw['column']`` (converted to an int and decremented by one before
     being handed to ``self.compare``).

     The output of ``self.compare`` is packed into a gzip-compressed tar
     archive which is registered as 'intersections'. When at most four
     files are given, a PNG Venn diagram of the counts is also produced
     and registered as 'venn_diagram'.

     Returns the value of ``self.display_time()``.
     """
     files_list = kw['SigMulti']['files']
     column = int(kw['column']) - 1
     output = self.temporary_path(fname='intersections.')
     # NOTE(review): `legend` is returned by compare() but venn() below is
     # explicitly called with legend=None -- confirm this is intended.
     counts, legend = self.compare(files_list, output, column)
     # compress; the context manager closes the archive even if add() fails
     output_targz = self.temporary_path(fname=output + 'tar.gz')
     with tarfile.open(output_targz, 'w:gz') as tar:
         tar.add(output)
     # Register the archive that was actually written (previously a
     # different, double-dotted name was registered).
     self.new_file(output_targz, 'intersections')
     if len(files_list) <= 4:
         # Venn diagram
         venn_format = 'png'
         # Keep a dot between the base name and the extension.
         venn_outname = self.temporary_path(fname='venn.' + venn_format)
         venn(counts,
              legend=None,
              options={},
              output=venn_outname,
              format=venn_format)
         self.new_file(venn_outname, 'venn_diagram')
     return self.display_time()
Пример #4
0
    def __call__(self, **kw):
        """Compute Venn group counts from a table or from tracks and report them.

        Keyword arguments read from ``kw``:

        * ``input_type``: "Table" (used when missing or empty) or "Tracks".
        * Table mode: ``table`` (file opened with ``track(..., format='txt',
          header=True)``), ``id_columns`` (comma-separated column numbers,
          each decremented by one to index a row) and ``filters``
          (comma-separated conditions, one per column; missing ones default
          to always-true).
        * Tracks mode: ``files`` (one path or a list of paths) and ``type``
          ('score' accumulates scores, anything else accumulates lengths
          which are then expressed as a percentage of the total).
        * ``output``: format of the diagram, 'pdf' when missing or empty.

        Two dictionaries keyed by group label (set letters joined by '|')
        are built: ``c1`` holds the value of items belonging to exactly that
        group, ``c2`` the value of items belonging to at least that group.
        ``c2`` is drawn with ``venn`` when there are at most four sets, and
        both are written to a tab-separated 'venn_summary' text file.

        Returns ``self.display_time()``, or None when the tracks have no
        coverage at all (only a short summary file is written then).
        Raises ValueError for an unsupported ``input_type``.
        """

        def _parse_logic(string):
            # Turn e.g. ">2 AND <5" into "(%f >2)and(%f <5)": every
            # sub-condition gets a %f placeholder for the tested number.
            # Characters other than word characters, digits, "!=><." and
            # spaces are dropped first.
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f "+s+")"

        def _run_test(row, indx, cond):
            # Evaluate condition `cond` on the number found in the row at
            # the indx-th selected column.
            num = float(row[col_ind[indx]])
            # Clamp to a finite range before formatting with %f.
            num = max(-sys.maxint,min(sys.maxint,num))
            # One copy of the number per placeholder of *this* condition
            # (use the parameter, not a variable leaked from the caller).
            num = (num,)*cond.count("%f")
            # NOTE(security): eval() runs a user-supplied filter; the
            # character stripping in _parse_logic is not a real sandbox.
            return eval(cond % (num))

        def _add_label(s,x):
            # Append the constant label x to every item of stream s as an
            # extra 'track_name' field.
            _f = s.fields+['track_name']
            return FeatureStream((y+(x,) for y in s), fields=_f)

        venn_options = {} # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns','')
            s_filters = kw.get('filters','')
            infile = track(kw.get('table',''),format='txt',header=True)
            col_ind = [int(i)-1 for i in s_cols.split(",")]
            legend = [infile.fields[i] if i<len(infile.fields) else str(i) for i in col_ind]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            # One letter per selected column: 'A', 'B', ...
            tlabels = [chr(k+65) for k in range(len(col_ind))]
            # Columns without a filter always pass.
            conds += ["1"]*(len(col_ind)-len(conds))
            combn = [tuple(sorted(x)) for k in range(len(tlabels)) 
                     for x in combinations(tlabels,k+1)]
            c1 = dict(("|".join(c),0) for c in combn)
            c2 = dict(("|".join(c),0) for c in combn)
            indx = dict((c,[tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [_run_test(row,i,c) for i,c in enumerate(conds)]
                # Exclusive group: exactly the columns passing for this row.
                # Counted once per row; a row passing no filter belongs to
                # no group and is skipped.
                passed = "|".join([tlabels[n] for n,t in enumerate(tests) if t])
                if passed:
                    c1[passed] += 1
                # Cumulative groups: every combination whose columns all pass.
                for c in combn:
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            # Same labels as the keys of c1/c2, for the summary below.
            combn = ['|'.join(x) for x in combn]
        elif intype == "Tracks":
            #filenames = kw['TrMulti']['files']
            filenames = kw['files']
            if not isinstance(filenames,(list,tuple)): filenames = [filenames]
            for f in filenames: assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f,chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            # One letter per track: 'A', 'B', ...
            tlabels = [chr(k+65) for k in range(len(tracks))]
            combn = [combinations(tlabels,k+1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn,[0]*len(combn)))
            c2 = dict(zip(combn,[0]*len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set([c for t in tracks for c in t.chrmeta])
            for chrom in chromset:
                streams = [_add_label(t.read(chrom),tlabels[n]) for n,t in enumerate(tracks)]
                s = cobble(concatenate(streams),scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored: score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx]-x[start_idx]
                    total_cov += length
                    sub = sorted(list(set(x[name_idx].split('|')))) # avoid 'A|A'
                    # All non-empty sub-combinations of the labels present.
                    cb = [combinations(sub,k) for k in range(1,len(sub)+1)]
                    cb = ['|'.join(sorted(y)) for c in cb for y in c]
                    if _scored:
                        c1['|'.join(sub)] += x[score_idx]
                        for c in cb: c2[c] += x[score_idx]
                    else:
                        c1['|'.join(sub)] += length
                        for c in cb: c2[c] += length
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                # Text mode: the message is a text string, like the regular
                # summary written at the end of this method.
                with open(output,'w') as summary:
                    summary.write("Empty content (no coverage) on %s." %(",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                # Express lengths as a percentage of the total covered length.
                for c in combn:
                    c2[c] = round((100*c2[c])/total_cov)
                    c1[c] = (100*c1[c])/total_cov
        else:
            raise ValueError("Input type '%s' not supported." %intype)


        if nsamples <= 4:
            format = kw.get('output') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.'+format)
            venn(c2,legend=legend,options=venn_options,output=output,format=format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output,'w') as summary:
            summary.write("%s\t%s\t%s\n" % ("Group","Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            # Shorter labels (fewer sets) first, then alphabetical.
            for c in sorted(combn, key=lambda x:(len(x),x)):
                summary.write(record%(c,c1[c],c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()
Пример #5
0
    def __call__(self, **kw):
        """Compute Venn group counts from a table or from tracks and report them.

        Keyword arguments read from ``kw``:

        * ``input_type``: "Table" (used when missing or empty) or "Tracks".
        * Table mode: ``table`` (file opened with ``track(..., format='txt',
          header=True)``), ``id_columns`` (comma-separated column numbers,
          each decremented by one to index a row) and ``filters``
          (comma-separated conditions, one per column; missing ones default
          to always-true).
        * Tracks mode: ``TrMulti`` (a mapping whose ``'files'`` entry is one
          path or a list of paths) and ``type`` ('score' accumulates scores,
          anything else accumulates lengths which are then expressed as a
          percentage of the total).
        * ``format``: format of the diagram, 'pdf' when missing or empty.

        Two dictionaries keyed by group label (set letters joined by '|')
        are built: ``c1`` holds the value of items belonging to exactly that
        group, ``c2`` the value of items belonging to at least that group.
        ``c2`` is drawn with ``venn`` when there are at most four sets, and
        both are written to a tab-separated 'venn_summary' text file.

        Returns ``self.display_time()``, or None when the tracks have no
        coverage at all (only a short summary file is written then).
        Raises ValueError for an unsupported ``input_type``.
        """
        def _parse_logic(string):
            # Turn e.g. ">2 AND <5" into "(%f >2)and(%f <5)": every
            # sub-condition gets a %f placeholder for the tested number.
            # Characters other than word characters, digits, "!=><." and
            # spaces are dropped first.
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f " + s + ")"

        def _run_test(row, indx, cond):
            # Evaluate condition `cond` on the number found in the row at
            # the indx-th selected column.
            num = float(row[col_ind[indx]])
            # Clamp to a finite range before formatting with %f.
            num = max(-sys.maxint, min(sys.maxint, num))
            # One copy of the number per placeholder of *this* condition
            # (use the parameter, not a variable leaked from the caller).
            num = (num, ) * cond.count("%f")
            # NOTE(security): eval() runs a user-supplied filter; the
            # character stripping in _parse_logic is not a real sandbox.
            return eval(cond % (num))

        def _add_label(s, x):
            # Append the constant label x to every item of stream s as an
            # extra 'track_name' field.
            _f = s.fields + ['track_name']
            return FeatureStream((y + (x, ) for y in s), fields=_f)

        venn_options = {}  # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns', '')
            s_filters = kw.get('filters', '')
            infile = track(kw.get('table', ''), format='txt', header=True)
            col_ind = [int(i) - 1 for i in s_cols.split(",")]
            legend = [
                infile.fields[i] if i < len(infile.fields) else str(i)
                for i in col_ind
            ]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            # One letter per selected column: 'A', 'B', ...
            tlabels = [chr(k + 65) for k in range(len(col_ind))]
            # Columns without a filter always pass.
            conds += ["1"] * (len(col_ind) - len(conds))
            combn = [
                tuple(sorted(x)) for k in range(len(tlabels))
                for x in combinations(tlabels, k + 1)
            ]
            c1 = dict(("|".join(c), 0) for c in combn)
            c2 = dict(("|".join(c), 0) for c in combn)
            indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [_run_test(row, i, c) for i, c in enumerate(conds)]
                # Exclusive group: exactly the columns passing for this row.
                # Counted once per row; a row passing no filter belongs to
                # no group and is skipped.
                passed = "|".join(
                    [tlabels[n] for n, t in enumerate(tests) if t])
                if passed:
                    c1[passed] += 1
                # Cumulative groups: every combination whose columns all pass.
                for c in combn:
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            # Same labels as the keys of c1/c2, for the summary below.
            combn = ['|'.join(x) for x in combn]
        elif intype == "Tracks":
            filenames = kw['TrMulti']['files']
            if not isinstance(filenames, (list, tuple)):
                filenames = [filenames]
            for f in filenames:
                assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f, chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            # One letter per track: 'A', 'B', ...
            tlabels = [chr(k + 65) for k in range(len(tracks))]
            combn = [combinations(tlabels, k + 1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn, [0] * len(combn)))
            c2 = dict(zip(combn, [0] * len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set([c for t in tracks for c in t.chrmeta])
            for chrom in chromset:
                streams = [
                    _add_label(t.read(chrom), tlabels[n])
                    for n, t in enumerate(tracks)
                ]
                s = cobble(concatenate(streams), scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored: score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx] - x[start_idx]
                    total_cov += length
                    sub = sorted(list(set(
                        x[name_idx].split('|'))))  # avoid 'A|A'
                    # All non-empty sub-combinations of the labels present.
                    cb = [combinations(sub, k) for k in range(1, len(sub) + 1)]
                    cb = ['|'.join(sorted(y)) for c in cb for y in c]
                    if _scored:
                        c1['|'.join(sub)] += x[score_idx]
                        for c in cb:
                            c2[c] += x[score_idx]
                    else:
                        c1['|'.join(sub)] += length
                        for c in cb:
                            c2[c] += length
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                # Text mode: the message is a text string, like the regular
                # summary written at the end of this method.
                with open(output, 'w') as summary:
                    summary.write("Empty content (no coverage) on %s." %
                                  (",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                # Express lengths as a percentage of the total covered length.
                for c in combn:
                    c2[c] = round((100 * c2[c]) / total_cov)
                    c1[c] = (100 * c1[c]) / total_cov
        else:
            raise ValueError("Input type '%s' not supported." % intype)

        if nsamples <= 4:
            format = kw.get('format') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.' + format)
            venn(c2,
                 legend=legend,
                 options=venn_options,
                 output=output,
                 format=format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output, 'w') as summary:
            summary.write("%s\t%s\t%s\n" %
                          ("Group", "Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            # Shorter labels (fewer sets) first, then alphabetical.
            for c in sorted(combn, key=lambda x: (len(x), x)):
                summary.write(record % (c, c1[c], c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()