示例#1
0
 def get_reference(self):
     if 'reference' in self.param:
         path = self.relative_path_as_path(self.param['reference'])
     else:
         path = self.working_dir
     
     return reference_directory.Reference(path, must_exist=True)
示例#2
0
    def run(self):
        workspace = self.get_workspace()

        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()

        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)

        filenames = [workspace / (item + '.fa') for item in reader.samples]
        for filename in filenames:
            with open(filename, 'wb'):
                pass

        for name, seq in io.read_sequences(
                reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):
                revised = []
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(
                    ), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number - 1])
                        assert re.match(
                            '[ACGTN]*$',
                            var_seq), 'Unsupported variant type: ' + var_seq
                    new_pos = variant.POS - 1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos + len(variant.REF)].upper(
                    ) == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])

                with open(filenames[i], 'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]
        assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(
            variants)
示例#3
0
    def run(self):
        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        jar = io.find_jar('snpEff.jar')

        with open(self.prefix + '.vcf', 'wb') as f:
            io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
                       JAR=jar,
                       GENOME=reference.name,
                       VCF=self.vcf,
                       CONFIG=reference / 'snpeff.config',
                       stdout=f)

        index_vcf(self.prefix + '.vcf')
示例#4
0
    def get_context(self):
        assert self.reference is not None, 'reference not given.'

        context = Context()
        context.action = self
        context.space = self.get_workspace()
        context.name = context.space.name
        context.sample_space = workspace.Workspace(context.space / 'samples',
                                                   False)
        context.reference = reference_directory.Reference(self.reference, True)

        for sample in self.samples:
            assert sample.reference is None, 'reference should not be specified within sample.'
            assert sample.output_dir, 'sample given without name.'
            assert os.path.sep not in sample.output_dir, 'sample name contains ' + os.path.sep
        context.samples = [
            sample(
                output_dir=context.sample_space / sample.output_dir,
                reference=self.reference,
            ) for sample in self.samples
        ]
        context.sample_dirs = [item.output_dir for item in context.samples]

        if not self.variants:
            context.variants = None
        else:
            context.variants = self.variants(
                context.space / 'variants',
                self.reference,
                samples=context.sample_dirs,
                analysis=self.variants.analysis or self.
                template,  #Use aligner from template unless aligner explicitly given
            )

        if not self.expression:
            context.expression = None
        else:
            context.expression = self.expression(
                context.space / 'expression',
                samples=context.sample_dirs,
            )

        return context
示例#5
0
文件: igv.py 项目: promodel/nesoni
    def run(self):
        base = os.path.split(self.prefix)[1]
        
        annotations = [ ]
        sequences = [ ]
        
        for filename in self.filenames:
            any = False
            if os.path.isdir(filename):
                reference = reference_directory.Reference(filename,must_exist=True)
                sequences.append(reference.reference_fasta_filename())
                if reference.annotations_filename():
                    annotations.append(reference.annotations_filename())
                any = True
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            assert any, 'File is neither a recognized sequence or annotation file'
        
        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ] 

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)
            
        f = open(property_filename,'wb')
        print >> f, 'ordered=true'
        print >> f, 'id=%s' % base
        print >> f, 'name=%s' % (self.name or base)
        print >> f, 'cytobandFile=%s_cytoband.txt' % base
        print >> f, 'geneFile=%s.gff' % base
        print >> f, 'sequenceLocation=%s' % base
        f.close()
        
        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               select=self.select
               ).run()
        
        f_cyt = open(cytoband_filename,'wb')
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                assert '/' not in name
                f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
                f.write(seq)
                f.close()
                print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
        f_cyt.close()
        
        genome_filename = self.prefix + '.genome'
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
示例#6
0
 def __init__(self, dirname):
     self.dirname = dirname
     self.ref = reference_directory.Reference(dirname, must_exist=True)
示例#7
0
    def run(self):
        #===============================================
        #                Sanity checks
        #===============================================
        
        assert len(set([ item.output_dir for item in self.samples ])) == len(self.samples), "Duplicate sample name."
        
        all_inputs = [ ]
        for sample in self.samples:
            all_inputs.extend(sample.reads)
        assert len(set(all_inputs)) == len(all_inputs), "Duplicate read filename."
        
        assert len(set([ item.output_dir for item in self.tests ])) == len(self.tests), "Duplicate test name."
        
        for test in self.tests:
            assert not test.analysis, "analysis parameter for tests should not be set, will be filled in automatically"
        
        #===============================================
        #                Run pipeline
        #===============================================
        
        names = [ sample.output_dir for sample in self.samples ]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        
        self._create_json()
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = self.types,
            parts = self.parts
            )
        
        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)

        job_gene_counts = analyse_template(
            output_dir = expressionspace/'genewise',
            extension = self.extension,
            title = 'Genewise expression - ' + self.title,
            file_prefix = file_prefix+'genewise-',
            ).make
        
        job_peaks = _call(self._run_peaks, 
            workspace=workspace, 
            expressionspace=expressionspace, 
            reference=reference, 
            dirs = dirs,
            analyse_template = analyse_template,
            file_prefix=file_prefix,
            )
        
        job_norm = nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make
            
        job_bigwig = bigwig.Polya_bigwigs(
            workspace/'bigwigs', 
            working_dirs = dirs, 
            norm_file = workspace/"norm.csv",
            peaks_file = workspace/("peaks", "relation-child.gff"),
            title = "IGV tracks - "+self.title
            ).make
        
        job_norm_bigwig = _call(_serial, job_norm, job_bigwig)

        job_utrs = tail_tools.Call_utrs(
            workspace/('peaks','primary-peak'),
            self.reference,
            self.output_dir,
            extension=self.extension
            ).make
            
        job_primpeak_counts = analyse_template(
            expressionspace/'primarypeakwise',
            annotations=workspace/('peaks','primary-peak-peaks.gff'), 
            extension=0,
            types='peak',
            parts='peak',
            title='Primary-peakwise expression - ' + self.title,
            file_prefix=file_prefix+'primarypeakwise-',
            ).make
        
        job_primpeak = _call(_serial, job_utrs, job_primpeak_counts)
        
        job_peak_primpeak_bigwig = _call(_serial, 
            job_peaks, 
            _call(_parallel, job_norm_bigwig, job_primpeak))
        
        job_count = _call(_parallel, job_gene_counts, job_peak_primpeak_bigwig)
            
        test_jobs = [ ]
        for test in self.tests:
            test_jobs.append(test(
                output_dir = testspace/test.output_dir,
                analysis = self.output_dir,
                ).make)

        job_test = _call(_parallel, *test_jobs)

        job_raw = self._extract_raw

        job_all = _call(_serial, job_count, _call(_parallel, job_raw, job_test))        
        
        job_all()



        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(workspace/'report', self.title, self.file_prefix, style=web.style())
        
        io.symbolic_link(source=workspace/'bigwigs', link_name=r.workspace/'bigwigs')
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="bigwigs/index.html">&rarr; Load tracks into IGV</a></div>')

        tail_tools.Shiny(workspace/('report','shiny'), self.output_dir, title=self.title, species=self.species).run()
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="shiny/" target="_blank">&rarr; Interactive report (shiny)</a></div>')
        
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )
        

        r.heading('Genewise expression')
        
        r.p("This is based on all reads within each gene (possibly from multiple peaks, or decay products).")
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')


        r.heading('Peakwise expression')
        
        r.p("This shows results from all called peaks.")
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        self._describe_peaks(r)
        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')


        r.subheading('Primary-peakwise expression')
        
        r.p("This is based on the most prominent peak in the 3'UTR for each gene. (Peak can be up to %d bases downstrand of the annotated 3'UTR end, but not inside another gene on the same strand.)" % self.extension)
        
        io.symbolic_link(source=expressionspace/('primarypeakwise','report'),link_name=r.workspace/'primarypeakwise')
        r.p('<a href="primarypeakwise/index.html">&rarr; Primary-peakwise expression</a>')

        r.p(r.get(workspace/('peaks','primary-peak-peaks.gff')) + ' - primary peaks for each gene.')
        r.p(r.get(workspace/('peaks','primary-peak-utrs.gff')) + ' - 3\' UTR regions, based on primary peak call.')
        r.p(r.get(workspace/('peaks','primary-peak-genes.gff')) + ' - full extent of gene, based on primary peak call.')


        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    % (test.output_dir, test.get_title()))


        web.Geneview_webapp(r.workspace/'view').run()        
                
        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.get(workspace/('peak-shift','grouped.json'))
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.get(workspace/('peak-shift','individual.json'))
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
        
        
        r.heading('Raw data')
        
        r.p(r.tar('csv-files',glob.glob(workspace/('raw','*.csv'))))
        
        r.write('<ul>\n')
        r.write('<li> -info.csv = gene name and product, etc\n')
        r.write('<li> -count.csv = read count\n')
        r.write('<li> -mlog2-RPM.csv = moderated log2 Reads Per Million\n')
        r.write('<li> -tail.csv = average poly(A) tail length\n')
        r.write('<li> -tail-count.csv = poly(A) read count\n')
        r.write('<li> -proportion.csv = proportion of reads with poly(A)\n')
        r.write('<li> -norm.csv = read count normalization used for log2 transformation, heatmaps, differential tests, etc etc\n')
        r.write('</ul>\n')

        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('<b>%d further bases 3\' extension was allowed</b> beyond the GFF files above (but not extending into the next gene on the same strand).' % self.extension)

        r.write('<p/><hr>\n')
        r.subheading('About normalization and log transformation')

        r.p('Counts are converted to '
            'log2 Reads Per Million using Anscombe\'s variance stabilizing transformation '
            'for the negative binomial distribution, implemented in '
            'R package "varistran".')
                
        r.write('<p/><hr>\n')

        r.p('Reference directory '+self.reference)
        r.p('Tail Tools version '+tail_tools.VERSION)
        r.p('Nesoni version '+nesoni.VERSION)
        
        r.close()