def _create_json(self):
     workspace = io.Workspace(self.output_dir, must_exist=False)
     
     samples = [ ]
     groups = [ ]
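     # Assign each sample to a group: the comma-joined names of all matching
     # group selections, or 'ungrouped'. Each sample's entry points at its
     # filtered, sorted BAM file.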
     for sample in self.samples:
         this_groups = [ ]
         for item in self.groups:
             if selection.matches(
                     selection.term_specification(item),
                     sample.tags + [ sample.output_dir ]
                     ):
                 this_groups.append(selection.term_name(item))
         group = ','.join(this_groups) if this_groups else 'ungrouped'
         if group not in groups: groups.append(group)
         
         item = {
             'name' : sample.output_dir,
             'bam' : os.path.abspath( 
                 workspace/('samples',sample.output_dir,'alignments_filtered_sorted.bam')
                 ),
             'group' : group,
             'tags' : sample.tags,
             }
         samples.append(item)
         
     obj = collections.OrderedDict()
     obj['reference'] = os.path.abspath( self.reference )
     obj['extension'] = self.extension
     obj['genes'] = os.path.abspath( workspace/('peaks','relation-parent.gff') )
     obj['peaks'] = os.path.abspath( workspace/('peaks','relation-child.gff') )
     obj['groups'] = groups
     obj['samples'] = samples
     
     with open(workspace/"plotter-config.json","wb") as f:
         json.dump(obj, f, indent=4)
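
For reference, here is a minimal sketch of the plotter-config.json layout that _create_json writes, assuming a single sample and one group named 'wildtype'. All paths, names, and values below are hypothetical placeholders, not taken from a real run.

# Minimal sketch of the config structure produced by _create_json above.
# All paths, names, and values are hypothetical placeholders.
import collections
import json

obj = collections.OrderedDict()
obj['reference'] = '/data/analysis/reference'                 # os.path.abspath(self.reference)
obj['extension'] = 200                                        # self.extension
obj['genes'] = '/data/analysis/peaks/relation-parent.gff'
obj['peaks'] = '/data/analysis/peaks/relation-child.gff'
obj['groups'] = ['wildtype']
obj['samples'] = [
    {
        'name': 'sample1',
        'bam': '/data/analysis/samples/sample1/alignments_filtered_sorted.bam',
        'group': 'wildtype',
        'tags': ['sample1', 'wildtype'],
    },
]

print(json.dumps(obj, indent=4))
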
 def run(self):
     data = io.read_grouped_table(
         self.counts,
         [('Count',str), ('Annotation',str), ('Tail_count',str), ('Tail',str), ('Proportion',str)],
         'Count',
         )
     
     features = data['Count'].keys()
     samples = data['Count'].value_type().keys()
     
     tags = { }
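      # Each sample is tagged with its own name by default; '#sampleTags='
      # comment lines in the counts table supply additional tags used below
      # for group selection.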
     for sample in samples:
         tags[sample] = [sample]        
     for line in data.comments:
         if line.startswith('#sampleTags='):
             parts = line[len('#sampleTags='):].split(',')
             tags[parts[0]] = parts
     
     group_names = [ ]
     groups = [ ]
     group_tags = [ ]
     
     for item in self.groups:
         select = selection.term_specification(item)
         name = selection.term_name(item)
         group = [ item for item in samples if selection.matches(select, tags[item]) ]
         assert group, 'Empty group: '+name
         
         this_group_tags = [ name ]
         for tag in tags[group[0]]:
             if tag == name: continue
             for item in group[1:]:
                 for item2 in tags[item]:
                     if tag not in item2: break
                 else:
                     this_group_tags.append(tag)
         
         group_names.append(name)
         groups.append(group)
         group_tags.append(this_group_tags)
     
     result = io.Grouped_table()
     result.comments = [ '#Counts' ]
     for item in group_tags:
         result.comments.append('#sampleTags='+','.join(item))
     
     
     count = [ ]
     tail_count = [ ]
     tail = [ ]
     proportion = [ ]
     for feature in features:
         this_count = [ ]
         this_tail_count = [ ]
         this_tail = [ ]
         this_proportion = [ ]
         for group in groups:
             this_this_count = [ ]
             this_this_tail_count = [ ]
             this_this_tail = [ ]
             this_this_proportion = [ ]
             for sample in group:
                 this_this_count.append(int(data['Count'][feature][sample]))
                 this_this_tail_count.append(int(data['Tail_count'][feature][sample]))
                 item = data['Tail'][feature][sample]
                 if item != 'NA': this_this_tail.append(float(item))
                 item = data['Proportion'][feature][sample]
                 if item != 'NA': this_this_proportion.append(float(item))
             
             this_count.append(str(sum(this_this_count)))
             this_tail_count.append(str(sum(this_this_tail_count)))
             this_tail.append(str(sum(this_this_tail)/len(this_this_tail)) if this_this_tail else 'NA')
             this_proportion.append(str(sum(this_this_proportion)/len(this_this_proportion)) if this_this_proportion else 'NA')
                 
         count.append(this_count)
         tail_count.append(this_tail_count)
         tail.append(this_tail)
         proportion.append(this_proportion)
     
     matrix = io.named_matrix_type(features,group_names)
     result['Count'] = matrix(count)
     result['Annotation'] = data['Annotation']
     result['Tail_count'] = matrix(tail_count)
     result['Tail'] = matrix(tail)
     result['Proportion'] = matrix(proportion)
     result.write_csv(self.prefix + '.csv')
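
The per-group aggregation above reduces to a simple rule: counts are summed within the group, while 'Tail' and 'Proportion' are averaged over the samples that are not 'NA'. A standalone sketch of that rule, with made-up values:

# Standalone sketch of the per-group aggregation rule used in run() above.
# Counts are summed; 'NA' entries are dropped before averaging. Values are made up.
def aggregate_group(counts, tails):
    total = sum(counts)
    observed = [float(t) for t in tails if t != 'NA']
    mean_tail = str(sum(observed) / len(observed)) if observed else 'NA'
    return str(total), mean_tail

print(aggregate_group([10, 0, 5], ['20.5', 'NA', '19.0']))    # ('15', '19.75')
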
   def run(self):
       assert self.method in ("limma", "fitnoise1", "fitnoise2"), "Unknown method."
        assert self.method != "limma" or not self.empirical_controls, "Empirical controls can not be used with the limma method."
       
       title = self.get_title()
   
       n_alt = len(self.alt)
       n_null = len(self.null)
       
       suffix = '-dedup' if self.dedup else ''
   
       genewise_filename = join(self.analysis,'expression','genewise'+suffix,'counts.csv')
       genewise_norm_filename = join(self.analysis,'expression','genewise'+suffix,'norm.csv')

       primarypeakwise_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'counts.csv')
       primarypeakwise_norm_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'norm.csv')

       peakwise_filename = join(self.analysis,'expression','peakwise'+suffix,'counts.csv')
       peakwise_norm_filename = join(self.analysis,'expression','peakwise'+suffix,'norm.csv')

       pairwise_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs.csv')
       pairwise_norm_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs-norm.csv')

   
       reader = io.Table_reader(genewise_filename, 'Count')
       reader.close()
       samples = [ item for i, item in enumerate(reader.headings) if reader.groups[i] == 'Count' ]
       tags = { }
       for item in samples:
           tags[item] = [ item ]
       for line in reader.comments:
           if line.startswith('#sampleTags='):
               parts = line[len('#sampleTags='):].split(',')
               tags[parts[0]] = parts
              
       model = [ ]
       for term in self.alt + self.null:        
           spec = selection.term_specification(term)
           model.append([ selection.weight(spec, tags[item]) for item in samples ])
       model = zip(*model) #Transpose
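        # After the transpose, model has one row per sample and one column per
        # term (alt terms first, then null terms).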
       
       select = [ any(row) for row in model ]
       model = [ row for row,selected in zip(model,select) if selected ]
       model_columns = [ selection.term_name(item) for item in self.alt + self.null ]
       model_rows = [ item for keep, item in zip(select, samples) if keep ]
       
        #Degust complains if a column name starts with '-' and uses commas as delimiters, so adjust names accordingly
       model_columns = [ ('.' if item[:1] == '-' else '') + item.replace(',',';') for item in model_columns ]
       
       pairs_n_alt = n_alt       
       pairs_select = select + select
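        # Each retained sample contributes two rows to the pair design: a peak1
        # row with zeroed interaction columns, and a peak2 row that repeats the
        # alt terms as interaction columns and sets the trailing 'pair2'
        # indicator to 1.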
       pairs_model = (
           [ (0,) * n_alt + row + (0,) for row in model ] +
           [ row[:n_alt]  + row + (1,) for row in model ] 
           )
       pairs_model_columns = (
           [ item+'-interaction' for item in model_columns[:n_alt] ] +
           model_columns +
           [ 'pair2' ]
           )
       pairs_model_rows = [ item+'-peak1' for item in model_rows ] + [ item+'-peak2' for item in model_rows ]
       
       
       design_str = '['+('-'*(8*n_alt-2))+'] test coefficients\n'
       for row, name in zip(model, model_rows):
           design_str += "%s %s\n" % (''.join('%7g ' % item for item in row), name)
       
       print
       print "Design matrix"
       print design_str
       print
       print 'Pair design matrix'
       print '['+('-'*(8*n_alt-2))+'] test coefficients'
       for row, name in zip(pairs_model, pairs_model_rows):
           print ''.join('%7g ' % item for item in row), name
       print
       
       
       workspace = self.get_workspace()
       
       runr.run_script(TEST_R, self.tell,
           DIR = workspace.working_dir,
           METHOD = self.method,
           WEIGHT = self.weight,
           EMPIRICAL_CONTROLS = self.empirical_controls,
           MIN_READS = self.min_reads,
           BIOTYPE = self.biotype,
           RELATION = self.relation,
           QUANTILE_TAIL = self.quantile_tail,
           DO_EXPRESSION = self.do_expression,
           DO_TAIL_LENGTH = self.do_tail_length,
           VERBOSE = self.verbose,
           
           GENEWISE_FILENAME = genewise_filename,
           GENEWISE_NORM_FILENAME = genewise_norm_filename,
           PRIMARYPEAKWISE_FILENAME = primarypeakwise_filename,
           PRIMARYPEAKWISE_NORM_FILENAME = primarypeakwise_norm_filename,
           PEAKWISE_FILENAME = peakwise_filename,
           PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
           PAIRWISE_FILENAME = pairwise_filename,
           PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
           
           N_ALT = n_alt,
           SELECT = select,
           MODEL = model,
           MODEL_COLUMNS = model_columns,
           PAIRS_N_ALT = pairs_n_alt,
           PAIRS_SELECT = pairs_select,
           PAIRS_MODEL = pairs_model,
           PAIRS_MODEL_COLUMNS = pairs_model_columns,
           )
       if self.tell: return
       
       reporter = reporting.Reporter(workspace.working_dir, title, style=web.style())
       
       if self.dedup:
           reporter.p('Read deduplication was used.')
       
       reporter.write('<table>\n')
       for is_expression, entities, result, aveexpr, subtitle, terms in [
           (True, 'genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
           (False, 'genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
           (True, 'primary peaks', 'primarypeakwise-voom', 'avg.expression', 'Primary-peakwise expression level', model_columns[:n_alt]),
           (False, 'primary peaks', 'primarypeakwise-tail', 'avg.tail', 'Primary-peakwise tail length', model_columns[:n_alt]),
           (True, 'peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
           (False, 'peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
           (True, 'peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
           (False, 'peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
           ]:
           #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
           #n = 0
           #n_01 = 0
           #n_05 = 0
           #for row in data.values():
           #    fdr = float(row['adj.P.Val'])
           #    if fdr <= 0.01: n_01 += 1
           #    if fdr <= 0.05: n_05 += 1
           #    n += 1
           
           if is_expression and not self.do_expression: continue
           if not is_expression and not self.do_tail_length: continue
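            # Render an interactive Degust report (HTML) for this contrast from
            # its toptable CSV.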
           
           io.execute([
               'degust.py',
               '--name', title + ' : ' + subtitle,
               '--avg', aveexpr,
               '--primary', 'baseline',
               '--logFC', ','.join(terms),
               '--fdr', 'adj.P.Val',
               '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,'+aveexpr,
               '--notour', '1',
               '--out', workspace/(result+'.html'),
               workspace/(result+'-toptable.csv'),
               ])

           with open(workspace/(result+'.txt'),'rU') as f:
               lines = f.readlines()
           
           reporter.write('<tr><td valign="top" width="33%">')
           reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
           #reporter.p( '%d %s, %d with fdr&lt;=0.01, %d with fdr&lt;=0.05' % (n,entities,n_01,n_05) )
           line = reporter.href(workspace/(result+'-toptable.csv'), 'Spreadsheet')
           if result.endswith('voom'):
               line += ', ' + reporter.href(workspace/(result+'.png'), 'voom plot')
           reporter.p(line)
           for line in lines[-2:]:
               reporter.p(line.strip())
           reporter.write('</td><td valign="top"><br/><br/>')
           for line in lines[:-2]:
               reporter.write(line.strip() + '<br/>\n')
           reporter.write('</td></tr>')

       reporter.write('</table>\n')
       
       reporter.subheading("Design matrix")
       
       reporter.write('<pre>' + design_str + '</pre>')
       
       reporter.close()
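
A small standalone example of the pair design expansion used in run() above, with a hypothetical two-sample model and n_alt = 1; the rows are invented for illustration.

# Sketch of the pairs design expansion with made-up rows (n_alt = 1).
# Columns of 'model': [alt term, baseline]; each sample yields a peak1 and a peak2 row.
n_alt = 1
model = [(1, 1), (0, 1)]

pairs_model = (
    [(0,) * n_alt + row + (0,) for row in model] +   # peak1 rows: interaction columns zeroed
    [row[:n_alt]  + row + (1,) for row in model]     # peak2 rows: alt terms repeated, 'pair2' = 1
    )
for row in pairs_model:
    print(row)
# (0, 1, 1, 0)
# (0, 0, 1, 0)
# (1, 1, 1, 1)
# (0, 0, 1, 1)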