Exemplos de TabFile.TabFile em Python, exemplos de bcftbx.TabFile.TabFile.TabFile em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: reporting.py Projeto: fls-bioinformatics-core/auto_process_ngs

 def add_per_lane_statistics(self):
     """
     Append a "Per-lane statistics" section to the report

     If the per-lane statistics file doesn't exist then a debug
     message is logged and nothing is added. Otherwise the file
     is loaded as a TabFile (first line is the header) and copied
     into a table, with one extra "Assigned/unassigned" column
     holding an inline stacked-bar image built from the
     "Assigned reads" and "Unassigned reads" values of each row.
     The new section is also registered in the table of contents.
     """
     stats_file = self._per_lane_stats_file
     if not os.path.exists(stats_file):
         logger.debug("No per-lane statistics file found")
         return
     section = self.add_section("Per-lane statistics",
                                name="per_lane_stats")
     stats = TabFile(stats_file, first_line_is_header=True)
     # Columns holding read counts are passed through
     # pretty_print_reads; all others are copied verbatim
     read_count_columns = ("Total reads",
                           "Assigned reads",
                           "Unassigned reads")
     table = Table(columns=stats.header())
     table.append_columns("Assigned/unassigned")
     for record in stats:
         row = table.add_row()
         for column in stats.header():
             cell = record[column]
             if column in read_count_columns:
                 cell = pretty_print_reads(cell)
             table.set_value(row, column, cell)
         # Graphical summary of assigned vs unassigned reads
         read_counts = (record["Assigned reads"],
                        record["Unassigned reads"])
         bar = ustackedbar(read_counts,
                           length=100,
                           height=15,
                           colors=('red', 'white'),
                           inline=True)
         table.set_value(row, "Assigned/unassigned", Img(bar))
     section.add(table)
     self.add_to_toc("Per-lane statistics", section)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: fastq_screen.py Projeto: zacapte/auto_process_ngs

    def __init__(self, screen_file):
        """
        Create a new FastqscreenData instance

        Reads the fastq_screen output in 'screen_file' and copies
        the per-library data into this object, using the 'Library'
        style column names regardless of whether the file uses the
        'Library' or the 'Genome' terminology.

        Arguments:
          screen_file (str): path to the fastq_screen output
            file to load

        Raises:
          Exception: if a data line is encountered before the
            'Library'/'Genome' header line, if no header line is
            found at all, or if the first header column is
            neither 'Library' nor 'Genome'.
        """
        TabFile.__init__(self,
                         column_names=(
                             'Library',
                             '%Unmapped',
                             '%One_hit_one_library',
                             '%Multiple_hits_one_library',
                             '%One_hit_multiple_libraries',
                             '%Multiple_hits_multiple_libraries',
                         ))
        self._screen_file = os.path.abspath(screen_file)
        self._version = None
        self._no_hits = None
        # Temporary TabFile for the raw data; stays as None until
        # the header line has been seen
        tabfile = None
        # Read in data
        with open(self._screen_file, 'r') as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('#Fastq_screen version:'):
                    # Version is the third whitespace-separated token
                    self._version = line.split()[2]
                elif line.startswith(('Library', 'Genome')):
                    # Header line defines the columns in this file
                    tabfile = TabFile(column_names=line.split())
                elif line.startswith(('%Hit_no_libraries:',
                                      '%Hit_no_genomes:')):
                    self._no_hits = float(line.split()[-1])
                elif not line or line.startswith(('#', '%')):
                    # Blank line, comment or other summary line
                    continue
                elif tabfile is None:
                    # Data before the header can't be interpreted
                    raise Exception("Unable to extract fastq_screen data "
                                    "from %s: data found before header "
                                    "line" % self._screen_file)
                else:
                    tabfile.append(tabdata=line)
        # Check that a header line was found
        if tabfile is None:
            raise Exception("Unable to extract fastq_screen data from %s" %
                            self._screen_file)
        # Handle different terminology for different versions: list
        # the source columns in the same order as this object's own
        # columns
        first_column = tabfile.header()[0]
        if first_column == 'Library':
            source_columns = ('Library',
                              '%Unmapped',
                              '%One_hit_one_library',
                              '%Multiple_hits_one_library',
                              '%One_hit_multiple_libraries',
                              '%Multiple_hits_multiple_libraries')
        elif first_column == 'Genome':
            source_columns = ('Genome',
                              '%Unmapped',
                              '%One_hit_one_genome',
                              '%Multiple_hits_one_genome',
                              '%One_hit_multiple_genomes',
                              '%Multiple_hits_multiple_genomes')
        else:
            raise Exception("Unrecognised fastq_screen header column '%s' "
                            "in %s" % (first_column, self._screen_file))
        # Copy data to main object
        for line in tabfile:
            self.append(data=[line[c] for c in source_columns])

Exemplo n.º 3

0

Exibir arquivo

    def __init__(self, filen=None, fp=None, name=None):
        """Create a new MacsXLS instance

        Lines that are blank or start with '#' are stored as header
        lines; the MACS version, the name and the command line are
        extracted from them where present. The first remaining line
        is taken to be the column names and all subsequent lines
        are stored as data, with an extra leading 'order' column.

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data. If None
            then the value from a '# name = ' header line is used,
            if there is one.

        Raises:
          Exception: if no MACS version line was found.

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary, remembering whether the handle
        # belongs to us (a handle supplied by the caller is left open)
        opened_here = fp is None
        if opened_here:
            fp = open(filen, 'r')
        try:
            # Iterate over lines
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith(
                            "# This file is generated by MACS version "):
                        # MACS version is the ninth token on the line
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and \
                         line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line
                        self.__command_line = line[len("# Command line: "):]
                elif self.__data is None:
                    # First line of actual data should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0, "order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it; the leading
                    # tab leaves the 'order' column empty for now
                    self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle, if we opened it, even when
            # parsing fails part way through
            if opened_here:
                fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, "
                            "not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

Exemplo n.º 4

0

Exibir arquivo

    def __init__(self, well_list_file):
        """
        Set up an ICell8WellList from a well list file.

        Arguments:
          well_list_file (str): location of the well list
            file; it is loaded with TabFile, taking the
            first line as the header.
        """
        well_list = TabFile(filen=well_list_file,
                            first_line_is_header=True)
        self._data = well_list

Exemplo n.º 5

0

Exibir arquivo

Arquivo: fastq_strand.py Projeto: nandr0id/auto_process_ngs

    def __init__(self,fastq_strand_out):
        """
        Create a new Fastqstrand instance

        Reads the fastq_strand output file and, for each genome
        listed, stores the '1st forward' and '2nd reverse' values
        along with the forward/reverse ratio and a strandedness
        call derived from that ratio:

        - ratio below 0.2: "reverse"
        - ratio above 5 (including infinite): "forward"
        - anything in between: "unstranded?"
        - both values zero (no ratio): "undetermined"

        Arguments:
          fastq_strand_out (str): path to the fastq_strand
            output file to load

        Raises:
          Exception: if a data line is found before the '#Genome'
            header line, or if there is no header line at all.
        """
        self._fastq_strand_out = os.path.abspath(fastq_strand_out)
        self._version = None
        self._genomes = AttributeDictionary()
        # Read in data
        tabfile = None
        with open(self._fastq_strand_out,'r') as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('#fastq_strand version:'):
                    # Version is the third whitespace-separated token
                    self._version = line.split()[2]
                elif line.startswith('#Genome'):
                    # Header line (minus the leading '#') gives the
                    # column names
                    tabfile = TabFile(column_names=line[1:].split('\t'))
                elif tabfile is None:
                    # Data can't be stored until the header has been seen
                    raise Exception("Unable to extract fastq_strand data "
                                    "from %s" % self._fastq_strand_out)
                else:
                    tabfile.append(tabdata=line)
        # Check there is some data
        if tabfile is None:
            raise Exception("Unable to extract fastq_strand data from %s" %
                            self._fastq_strand_out)
        # Copy data to main object
        for line in tabfile:
            # Store the data
            data = AttributeDictionary()
            self._genomes[line['Genome']] = data
            data['forward'] = line['1st forward']
            data['reverse'] = line['2nd reverse']
            # Additional processing
            if data.reverse > 0.0:
                ratio = float(data.forward)/float(data.reverse)
            elif data.forward > 0.0:
                # No reverse but some forward
                ratio = float("+inf")
            else:
                # Neither forward nor reverse
                ratio = None
            if ratio is None:
                strandedness = "undetermined"
            elif ratio < 0.2:
                strandedness = "reverse"
            elif ratio > 5:
                # Infinite ratio also ends up here
                strandedness = "forward"
            else:
                strandedness = "unstranded?"
            data['ratio'] = ratio
            data['strandedness'] = strandedness

Exemplo n.º 6

0

Exibir arquivo

Arquivo: stats.py Projeto: nandr0id/auto_process_ngs

    def report_basic_stats(self,out_file=None,fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        The report is written as tab-delimited data including a
        header line.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied); it is not closed by this method
        """
        # Assemble the report before touching the output, so that
        # 'out_file' isn't created or truncated if this step fails
        stats = TabFile(column_names=('Project',
                                      'Sample',
                                      'Fastq',
                                      'Size',
                                      'Nreads',
                                      'Paired_end'))
        for line in self._stats:
            stats.append(data=[line[c] for c in stats.header()])
        # Write to the appropriate output stream
        if fp is not None:
            stats.write(fp=fp,include_header=True)
        elif out_file is None:
            stats.write(fp=sys.stdout,include_header=True)
        else:
            # File is opened here, so make sure it is always closed
            with open(out_file,'w') as fpp:
                stats.write(fp=fpp,include_header=True)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: processing.py Projeto: zacapte/auto_process_ngs

def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report starts with a table of contents, followed by up to
    three sections; each one is only included if the file that it
    is built from exists:

    - Per-lane statistics: from the file given by the
      'per_lane_stats_file' parameter of the analysis directory
      (or "per_lane_statistics.info" if that is not set)
    - Per-lane statistics by sample: from
      "per_lane_sample_stats.info"
    - Per-file statistics by project: from "statistics_full.info"
      or, if that doesn't exist, the file given by the
      'stats_file' parameter (or "statistics.info" if not set)

    The default file names are used as given (i.e. they are
    relative paths).

    Arguments:
      analysis_dir (AnalysisDir): object supplying the
        'analysis_dir' path (used in the report title) and the
        'params' used to locate the statistics files
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               "        font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section("Per-lane statistics",
                                                   name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads", "Assigned reads", "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Extra column with a bar showing assigned vs unassigned
            tbl.set_value(
                n, "Assigned/unassigned",
                Img(
                    ustackedbar(
                        (line["Assigned reads"], line["Unassigned reads"]),
                        length=100,
                        height=15,
                        colors=('red', 'white'),
                        inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = "per_lane_sample_stats.info"
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample", name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        lane_data = []
        with open(per_lane_sample_stats_file) as stats:
            for line in stats:
                if line.startswith("Lane "):
                    # Start of the data for a new lane
                    lane = int(line.split(' ')[-1])
                    lane_data.append({
                        'lane': lane,
                        'total_reads': None,
                        'samples': []
                    })
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Sample line: second field is "project/sample",
                    # then the number of reads and the percentage
                    fields = line.split()
                    names = fields[1].split('/')
                    lane_data[-1]['samples'].append({
                        'pname': names[0],
                        'sname': names[1],
                        'nreads': int(fields[2]),
                        'percreads': fields[3]
                    })
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            # Bars are scaled to the largest sample in the lane
            # (nothing to scale if the lane has no samples)
            max_reads = (max([d['nreads'] for d in data['samples']])
                         if data['samples'] else 0)
            total_reads = data['total_reads']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane, name="per_lane_sample_stats_lane%d" % lane)
            lane_toc_list.add_item(Link("Lane %d" % lane, s))
            current_project = None
            tbl = Table(
                columns=('pname', 'sname', 'nreads', 'percreads', 'barplot'),
                pname='Project',
                sname='Sample',
                nreads='Nreads',
                percreads='%reads',
                barplot='',
            )
            s.add(tbl)
            for sample in data['samples']:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on its first row
                if pname == current_project:
                    pname = "&nbsp;"
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        toc_list.add_item(
            Link("Per-lane statistics by sample", per_lane_sample_stats),
            lane_toc_list)
    # Per fastq statistics
    stats_file = "statistics_full.info"
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project", name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(set([d['Project'] for d in stats]))
        # Real lists (rather than the results of 'filter') are used
        # below because they are iterated more than once and tested
        # for emptiness
        lanes = [c for c in stats.header() if c.startswith('L')]
        for project in projects:
            # Sample names are tracked per project, so the first row
            # of each project's table always shows its sample name
            sample = None
            subset = [d for d in stats if d['Project'] == project]
            # Only keep lanes with a value for at least one file
            # in this project
            subset_lanes = [l for l in lanes
                            if any(bool(d[l]) for d in subset)]
            s = per_file_stats.add_subsection("%s" % project,
                                              name="per_file_stats_%s" %
                                              project)
            project_toc_list.add_item(Link("%s" % project, s))
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            for line in subset:
                # Only show the sample name on its first row
                if sample == line['Sample']:
                    sname = "&nbsp;"
                else:
                    sample = line['Sample']
                    sname = sample
                data = {
                    'Sample': sname,
                    'Fastq': line['Fastq'],
                    'Size': line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else '')
                }
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                # Bar showing the reads from each lane (empty
                # values are left out)
                barplot = ustackedbar([line[l] for l in subset_lanes
                                       if line[l] != ''],
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(
            Link("Per-file statistics by project", per_file_stats),
            project_toc_list)
    # Write the processing QC summary file
    processing_qc.write(html_file)

Exemplo n.º 8

0

Exibir arquivo

    # line then use "XLS_<input_name>.xls"
    if len(args) == 2:
        xls_out = args[1]
    else:
        # MACS output file might already have an .xls extension
        # but we'll add an explicit .xls extension
        xls_out = "XLS_" + os.path.splitext(
            os.path.basename(macs_in))[0] + ".xls"
    print "Input file: %s" % macs_in
    print "Output XLS: %s" % xls_out

    # Extract the header from the MACS and feed actual data to
    # TabFile object
    header = []
    data = TabFile(column_names=[
        'chr', 'start', 'end', 'length', 'summit', 'tags', '-10*log10(pvalue)',
        'fold_enrichment', 'FDR(%)'
    ])
    fp = open(macs_in, 'r')
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())
    fp.close()

    # Temporarily remove first line
    header_line = str(data[0])
    del (data[0])

Exemplo n.º 9

0

Exibir arquivo

Arquivo: stats.py Projeto: zacapte/auto_process_ngs

 def _get_data(self, filen=None):
     """
     Collect statistics for FASTQ outputs from an Illumina run

     Gathers data for every FASTQ in every project (and for the
     undetermined reads, if present) and stores the results in
     the '_stats' TabFile, with one 'L<n>' column for each lane
     found. The sorted lane numbers are stored in '_lanes'.

     Arguments:
       filen (str): optional, path to an existing statistics
         file. Its rows are copied into the new data, and any
         row matching a FASTQ from this run (same project,
         sample and FASTQ name) is then overwritten.
     """
     # Collect FASTQ files
     fastqstats = []
     for project in self._illumina_data.projects:
         for sample in project.samples:
             for fastq in sample.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(sample.dirn, fastq),
                                project.name, sample.name))
     # Gather same information for undetermined reads (if present)
     if self._illumina_data.undetermined is not None:
         for lane in self._illumina_data.undetermined.samples:
             for fastq in lane.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(lane.dirn, fastq),
                                self._illumina_data.undetermined.name,
                                lane.name))
     # Collect the data for each file
     if self._n_processors > 1:
         # Multiple cores
         pool = Pool(self._n_processors)
         results = pool.map(collect_fastq_data, fastqstats)
         pool.close()
         pool.join()
     else:
         # Single core; make a real list as the results are
         # iterated over several times below
         results = list(map(collect_fastq_data, fastqstats))
     # Set up tabfile to hold pre-existing data
     if filen is not None:
         existing_stats = TabFile(filen, first_line_is_header=True)
     else:
         existing_stats = None
     # Set up class to hold all collected data
     self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                         'Size', 'Nreads', 'Paired_end',
                                         'Read_number'))
     # Split result sets into R1 and R2
     results_r1 = [f for f in results if f.read_number == 1]
     results_r2 = [f for f in results if f.read_number == 2]
     # Determine which lanes are present and append
     # columns for each
     lanes = set()
     for fastq in results_r1:
         logger.debug("-- %s: lanes %s" %
                      (fastq.name, ','.join([str(l) for l in fastq.lanes])))
         for lane in fastq.lanes:
             lanes.add(lane)
     # Add lane numbers from pre-existing stats file
     if existing_stats is not None:
         for c in existing_stats.header():
             if c.startswith('L'):
                 lanes.add(int(c[1:]))
     self._lanes = sorted(lanes)
     logger.debug("Lanes found: %s" %
                  ','.join([str(l) for l in self._lanes]))
     for lane in self._lanes:
         self._stats.appendColumn("L%s" % lane)
     # NB from here on the sorted list 'self._lanes' is always used
     # when building rows, so that the values are in the same order
     # as the lane columns that were just appended
     # Copy pre-existing stats into new tabfile
     if existing_stats is not None:
         for line in existing_stats:
             data = [
                 line['Project'], line['Sample'], line['Fastq'],
                 line['Size'], line['Nreads'], line['Paired_end'],
                 line['Read_number']
             ]
             for lane in self._lanes:
                 try:
                     data.append(line["L%s" % lane])
                 except Exception:
                     # Best effort: lane not in the pre-existing file
                     data.append('')
             self._stats.append(data=data)
     # Copy reads per lane from R1 FASTQs into R2
     for r2_fastq in results_r2:
         # Get corresponding R1 name
         logger.debug("-- Fastq R2: %s" % r2_fastq.name)
         r1_fastq_name = IlluminaFastq(r2_fastq.name)
         r1_fastq_name.read_number = 1
         r1_fastq_name = str(r1_fastq_name)
         logger.debug("--    -> R1: %s" % r1_fastq_name)
         # Locate corresponding data (first R1 with a matching name)
         r1_fastq = [f for f in results_r1
                     if f.name.startswith(r1_fastq_name)][0]
         r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
     # Write the data into the tabfile
     paired_end = ('Y' if self._illumina_data.paired_end else 'N')
     for fastq in results:
         # Check for existing entry
         existing_line = None
         for line in self._stats:
             if (line['Project'] == fastq.project
                     and line['Sample'] == fastq.sample
                     and line['Fastq'] == fastq.name):
                 existing_line = line
                 break
         # Write the data
         if existing_line is None:
             # Append new entry
             data = [
                 fastq.project, fastq.sample, fastq.name,
                 bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                 paired_end, fastq.read_number
             ]
             for lane in self._lanes:
                 try:
                     data.append(fastq.reads_by_lane[lane])
                 except Exception:
                     # Best effort: no reads recorded for this lane
                     data.append('')
             self._stats.append(data=data)
         else:
             # Overwrite existing entry
             logging.warning("Overwriting exisiting entry for "
                             "%s/%s/%s" %
                             (fastq.project, fastq.sample, fastq.name))
             existing_line['Size'] = bcf_utils.format_file_size(fastq.fsize)
             existing_line['Nreads'] = fastq.nreads
             existing_line['Paired_end'] = paired_end
             existing_line['Read_number'] = fastq.read_number
             for lane in self._lanes:
                 lane_name = "L%d" % lane
                 try:
                     existing_line[lane_name] = fastq.reads_by_lane[lane]
                 except Exception:
                     # Best effort: no reads recorded for this lane
                     existing_line[lane_name] = ''

Exemplo n.º 10

0

Exibir arquivo

Arquivo: stats.py Projeto: zacapte/auto_process_ngs

    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        For each lane the report gives the total number of reads,
        the numbers of assigned and unassigned reads, and the
        percentages of each (to two decimal places). Only R1
        FASTQs which are not index reads are counted; reads from
        the 'Undetermined_indices' project count as unassigned and
        reads from all other projects as assigned.

        The report is written as tab-delimited data including a
        header line.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied); it is not closed by this method
        """
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane', 'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads', '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane
        assigned = {}
        unassigned = {}
        for lane in self.lane_names:
            assigned[lane] = 0
            unassigned[lane] = 0
        # Count assigned and unassigned (= undetermined) reads
        for line in self._stats:
            # Only count R1 FASTQs which aren't index reads
            if line['Read_number'] != 1 or \
               IlluminaFastq(line['Fastq']).is_index_read:
                continue
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in self.lane_names:
                # Skip lanes with no value (or zero) for this FASTQ
                if line[lane]:
                    counts[lane] += line[lane]
        # Assemble data for each lane
        for lane in self.lane_names:
            # Lane names are 'L' followed by the lane number
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            unassigned_reads = unassigned[lane]
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                # Avoid division by zero for lanes with no reads
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number, total_reads,
                                        assigned_reads, unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to the appropriate output stream; this is done last
        # so 'out_file' isn't created or truncated if collecting
        # the data fails
        if fp is not None:
            per_lane_stats.write(fp=fp, include_header=True)
        elif out_file is None:
            per_lane_stats.write(fp=sys.stdout, include_header=True)
        else:
            # File is opened here, so make sure it is always closed
            with open(out_file, 'w') as fpp:
                per_lane_stats.write(fp=fpp, include_header=True)

Exemplo n.º 11

0

Exibir arquivo

    # Remove the working directory
    shutil.rmtree(working_dir)

    # Report the stats
    if args.stats_file is not None:
        # Output column names
        stats_file = os.path.abspath(args.stats_file)
        nreads_col = "Nreads%s" % ('' if args.suffix is None else args.suffix)
        umis_col = "Distinct_UMIs%s" % ('' if args.suffix is None else
                                        args.suffix)
        if not (os.path.isfile(stats_file) and args.append):
            # Create new stats file
            if well_list is not None:
                # Initialise barcode and sample names from well list
                stats_data = TabFile(column_names=('Barcode', 'Sample'))
                for barcode in well_list.barcodes():
                    stats_data.append(data=(barcode,
                                            well_list.sample(barcode)))
            else:
                # Barcodes from collected data
                stats_data = TabFile(column_names=('Barcode', ))
                for barcode in stats.barcodes():
                    stats_data.append(data=(barcode, ))
        else:
            # Append to an existing file
            stats_data = TabFile(filen=stats_file, first_line_is_header=True)
        # Add new columns of data
        stats_data.appendColumn(nreads_col)
        stats_data.appendColumn(umis_col)
        # Populate columns

Exemplo n.º 12

0

Exibir arquivo

Arquivo: processing.py Projeto: nandr0id/auto_process_ngs

def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report is assembled from up to three statistics files
    located relative to the analysis directory; a section is only
    added when the corresponding file exists:

    - per-lane statistics (``params.per_lane_stats_file``, falling
      back to ``per_lane_statistics.info``)
    - per-lane statistics by sample (``per_lane_sample_stats.info``)
    - per-file statistics by project (``statistics_full.info``,
      falling back to ``params.stats_file`` and then to
      ``statistics.info``)

    A warnings banner is always added to the document but is given
    the ``hide`` CSS class when no problems were detected. A hidden
    ``status`` section recording ``Status: OK`` or
    ``Status: WARNINGS`` is also written.

    Arguments:
      analysis_dir (AutoProcess): AutoProcess instance for
        the directory to report the processing from
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               "        font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # NB hex colours need the leading '#' to be valid CSS
    processing_qc.add_css_rule("p.warning { padding: 5px;\n"
                               "            border: solid 1px red;\n"
                               "            background-color: #F5BCA9;\n"
                               "            color: red;\n"
                               "            font-weight: bold;\n"
                               "            border-radius: 10px;\n"
                               "            display: inline-block; }")
    processing_qc.add_css_rule(".warnings { padding: 2px;\n"
                               "            border: solid 3px red;\n"
                               "            background-color: #F5BCA9;\n"
                               "            color: red;\n"
                               "            font-weight: bold;\n"
                               "            margin: 10px;\n"
                               "            border-radius: 10px;\n"
                               "            display: inline-block; }")
    processing_qc.add_css_rule("img { vertical-align: middle; }")
    processing_qc.add_css_rule(".hide { display: none; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Add warnings section
    # This will be hidden if there are no issues
    status = True
    warnings = processing_qc.add_section(css_classes=("warnings", ))
    warnings.add(
        Para(WarningIcon(size=50),
             "There are issues with one or more lanes or samples"))
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    per_lane_stats_file = get_absolute_file_path(
        per_lane_stats_file, base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section("Per-lane statistics",
                                                   name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads", "Assigned reads", "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Extra column with a bar showing assigned vs unassigned
            tbl.set_value(
                n, "Assigned/unassigned",
                Img(
                    ustackedbar(
                        (line["Assigned reads"], line["Unassigned reads"]),
                        length=100,
                        height=15,
                        colors=('red', 'white'),
                        inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = get_absolute_file_path(
        "per_lane_sample_stats.info", base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample", name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        # The file is parsed line-by-line: a "Lane N" line starts a
        # new lane, and subsequent "Total reads = ..." and "- ..."
        # lines are attached to the most recently started lane
        lane_data = list()
        with open(per_lane_sample_stats_file, 'r') as stats:
            for line in stats:
                if line.startswith("Lane "):
                    lane = int(line.split(' ')[-1])
                    lane_data.append({
                        'lane': lane,
                        'total_reads': None,
                        'samples': []
                    })
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Fields are: "-", "PROJECT/SAMPLE", number of
                    # reads and then the percentage of reads
                    fields = line.split()
                    name_parts = fields[1].split('/')
                    lane_data[-1]['samples'].append({
                        'pname': name_parts[0],
                        'sname': name_parts[1],
                        'nreads': int(fields[2]),
                        'percreads': fields[3]
                    })
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane, name="per_lane_sample_stats_lane%d" % lane)
            # Check for problems
            has_warnings = False
            if not data['samples']:
                # No samples reported
                s.add(
                    Para(WarningIcon(),
                         "No samples reported for this lane",
                         css_classes=('warning', )))
                has_warnings = True
            elif min([d['nreads'] for d in data['samples']]) == 0:
                # There are samples with no reads
                s.add(
                    Para(WarningIcon(),
                         "One or more samples with no reads",
                         css_classes=('warning', )))
                has_warnings = True
            # Add link to lane for lane ToC
            link = Link("Lane %d" % lane, s)
            if not has_warnings:
                lane_toc_list.add_item(link)
            else:
                lane_toc_list.add_item(WarningIcon(), link)
                status = False
            # Write out the data, if there is any
            if not data['samples']:
                continue
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            current_project = None
            tbl = Table(
                columns=('pname', 'sname', 'nreads', 'percreads', 'barplot'),
                pname='Project',
                sname='Sample',
                nreads='Nreads',
                percreads='%reads',
                barplot='',
            )
            s.add(tbl)
            # Sort the sample data into order of sample name
            # (sort the list directly: a list comprehension using 's'
            # as its loop variable would rebind the subsection 's'
            # under Python 2)
            samples = sorted(
                data['samples'],
                key=lambda smpl: split_sample_name(smpl['sname']))
            # Write the table
            for sample in samples:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on the first row of
                # each run of samples from the same project
                if pname == current_project:
                    pname = "&nbsp;"
                else:
                    current_project = pname
                # Bar is scaled relative to the largest sample in
                # this lane
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                if nreads == 0:
                    sname = Para(WarningIcon(), sname)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        # Add link to section from main ToC
        toc_list.add_item(
            Link("Per-lane statistics by sample", per_lane_sample_stats),
            lane_toc_list)
    # Per fastq statistics
    stats_file = get_absolute_file_path("statistics_full.info",
                                        base=analysis_dir.analysis_dir)
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    stats_file = get_absolute_file_path(stats_file,
                                        base=analysis_dir.analysis_dir)
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project", name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(set(d['Project'] for d in stats))
        # NB real lists are required below: these collections are
        # tested for emptiness and traversed more than once, which
        # doesn't work with the one-shot iterators that 'filter'
        # returns under Python 3
        lanes = [c for c in stats.header() if c.startswith('L')]
        for project in projects:
            # Each project gets its own table, so the first row must
            # always show its sample name (even if it matches the
            # last sample of the previous project)
            sample = None
            # Get subset of lines for this project
            subset = sorted([d for d in stats if d['Project'] == project],
                            key=lambda l: split_sample_name(l['Sample']))
            # Work out which lanes are included i.e. lanes with a
            # non-empty value for at least one line in the subset
            subset_lanes = [l for l in lanes if any(d[l] for d in subset)]
            # Add a new section for this project
            s = per_file_stats.add_subsection("%s" % project,
                                              name="per_file_stats_%s" %
                                              project)
            # Check for problems
            # NOTE(review): the comparison with zero assumes TabFile
            # has converted the read counts to numbers -- confirm
            has_warnings = False
            for line in subset:
                nreads = [line[l] for l in subset_lanes if line[l] != '']
                if not nreads or min(nreads) == 0:
                    s.add(
                        Para(WarningIcon(), "One or more Fastqs with zero "
                             "read counts in one or more lanes",
                             css_classes=('warning', )))
                    has_warnings = True
                    break
            # Add link to project from ToC
            link = Link("%s" % project, s)
            if not has_warnings:
                project_toc_list.add_item(link)
            else:
                project_toc_list.add_item(WarningIcon(), link)
                status = False
            # Build the table of data
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            for line in subset:
                # Only show the sample name on the first row of each
                # run of Fastqs from the same sample
                if sample == line['Sample']:
                    sname = "&nbsp;"
                else:
                    sample = line['Sample']
                    sname = sample
                data = {
                    'Sample': sname,
                    'Fastq': line['Fastq'],
                    'Size': line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else ''),
                }
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                nreads = [line[l] for l in subset_lanes if line[l] != '']
                if not nreads:
                    # No counts at all: treat as a single zero count
                    nreads = [0]
                if min(nreads) == 0:
                    # Add warning icon to Fastq with no reads in
                    # at least one lane
                    data['Fastq'] = Para(WarningIcon(), data['Fastq'])
                barplot = ustackedbar(nreads,
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(
            Link("Per-file statistics by project", per_file_stats),
            project_toc_list)
    # Set the visibility of the warning header
    if status:
        warnings.add_css_classes("hide")
    # Add an non-visible section that the publisher can
    # read to determine if there were problems
    s = processing_qc.add_section(name="status", css_classes=("hide", ))
    s.add("Status: %s" % ('OK' if status else 'WARNINGS', ))
    # Write the processing QC summary file
    processing_qc.write(html_file)

Exemplo n.º 13

0

Exibir arquivo

Arquivo: reporting.py Projeto: fls-bioinformatics-core/auto_process_ngs

 def add_per_fastq_statistics(self):
     """
     Add a section with the per-Fastq statistics

     Reads the statistics from the file referenced by
     ``self._stats_file`` and adds a "Per-file statistics by
     project" section, with one subsection and table per
     project, plus an entry in the table of contents.

     If the statistics file doesn't exist then a debug message
     is logged and nothing is added.

     Projects where one or more Fastqs have no read counts, or
     a zero read count, in any of the project's lanes are marked
     with a warning, and ``self.flag_warnings()`` is invoked.
     """
     # Per fastq statistics
     if not os.path.exists(self._stats_file):
         logger.debug("No per-Fastq statistics file found")
         return
     per_file_stats = self.add_section("Per-file statistics by project",
                                       name="per_file_stats")
     project_toc_list = List()
     per_file_stats.add(project_toc_list)
     stats = TabFile(self._stats_file, first_line_is_header=True)
     projects = sorted(set(d['Project'] for d in stats))
     # Lane columns are those with names starting with 'L'
     lanes = [c for c in stats.header() if c.startswith('L')]
     for project in projects:
         # Each project gets its own table, so the first row must
         # always show its sample name (even if it matches the
         # last sample of the previous project)
         sample = None
         # Get subset of lines for this project
         subset = sorted([d for d in stats if d['Project'] == project],
                         key=lambda l: split_sample_name(l['Sample']))
         # Determine which lanes this project appears in i.e. lanes
         # with a non-empty value for at least one line in the subset
         subset_lanes = [l for l in lanes if any(d[l] for d in subset)]
         # Add a new section for this project
         s = per_file_stats.add_subsection("%s" % project,
                                           name="per_file_stats_%s" %
                                           project)
         # Check for problems
         # NOTE(review): the comparison with zero assumes TabFile
         # has converted the read counts to numbers -- confirm
         has_warnings = False
         for line in subset:
             nreads = [line[l] for l in subset_lanes if line[l] != '']
             if not nreads or min(nreads) == 0:
                 s.add(
                     self.warning("One or more Fastqs with zero read "
                                  "counts in one or more lanes"))
                 has_warnings = True
                 break
         # Add link to project from ToC
         link = Link("%s" % project, s)
         if not has_warnings:
             project_toc_list.add_item(link)
         else:
             project_toc_list.add_item(WarningIcon(), link)
             self.flag_warnings()
         # Build the table of data
         tbl = Table(columns=('Sample', 'Fastq', 'Size'))
         if subset_lanes:
             tbl.append_columns(*subset_lanes)
         tbl.append_columns('Barplot', 'Nreads')
         s.add(tbl)
         for line in subset:
             # Only show the sample name on the first row of each
             # run of Fastqs from the same sample
             if sample == line['Sample']:
                 sname = "&nbsp;"
             else:
                 sample = line['Sample']
                 sname = sample
             data = {
                 'Sample': sname,
                 'Fastq': line['Fastq'],
                 'Size': line['Size'],
                 'Nreads': (pretty_print_reads(line['Nreads'])
                            if line['Nreads'] != '' else ''),
             }
             for l in subset_lanes:
                 data[l] = (pretty_print_reads(line[l])
                            if line[l] != '' else '')
             nreads = [line[l] for l in subset_lanes if line[l] != '']
             if not nreads:
                 # No counts at all: treat as a single zero count
                 nreads = [0]
             if min(nreads) == 0:
                 # Add warning icon to Fastq with no reads in
                 # at least one lane
                 data['Fastq'] = Para(WarningIcon(), data['Fastq'])
             barplot = ustackedbar(nreads,
                                   length=100,
                                   height=10,
                                   colors=('grey', 'lightgrey'),
                                   bbox=True,
                                   inline=True)
             data['Barplot'] = Img(barplot)
             tbl.add_row(**data)
     # Add to table of contents
     self.add_to_toc("Per-file statistics by project", per_file_stats,
                     project_toc_list)