# Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error("input XLS comes from MACS %s, this version only handles 1.4" %
                      macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'],reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'],reverse=True)

    # Restore first line
    data.insert(0,tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1,len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns(['order','chr','start','end','length','summit','tags',
                                '-10*log10(pvalue)','fold_enrichment','FDR(%)'])

    # Legends text
    legends_text = """order\tSorting order Pvalue and FE
chr\tChromosome location of binding region
start\tStart coordinate of binding region
end\tEnd coordinate of binding region
summit-100\tSummit - 100bp
summit+100\tSummit + 100bp
Пример #2
0
class FastqStatistics:
    """
    Class for collecting and reporting stats on Illumina FASTQs

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq or bcl2fastq2,
    collects statistics for each file and provides methods for
    reporting different aspects.

    Example usage:

    >>> from IlluminaData import IlluminaData
    >>> data = IlluminaData('120117_BLAH_JSHJHXXX','bcl2fastq')
    >>> stats = FastqStatistics(data)
    >>> stats.report_basic_stats('basic_stats.out')

    """
    def __init__(self, illumina_data, n_processors=1, add_to=None):
        """
        Create a new FastqStatistics instance

        Arguments:
          illumina_data: populated IlluminaData object describing the
            run.
          n_processors: number of processors to use (if >1 then uses
            the multiprocessing library to run the statistics gathering
            using multiple cores).
          add_to: optional, add the data to that from an existing
            statistics file
        """
        self._illumina_data = illumina_data
        self._n_processors = n_processors
        self._stats = None
        # NB kept for backwards compatibility; lane names are exposed
        # via the 'lane_names' property (which uses self._lanes)
        self._lane_names = []
        self._get_data(filen=add_to)

    def _get_data(self, filen=None):
        """
        Collect statistics for FASTQ outputs from an Illumina run

        Arguments:
          filen: optional, name of an existing statistics file whose
            data will be merged with the newly collected statistics
        """
        # Collect FASTQ files
        fastqstats = []
        for project in self._illumina_data.projects:
            for sample in project.samples:
                for fastq in sample.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(sample.dirn, fastq),
                                   project.name, sample.name))
        # Gather same information for undetermined reads (if present)
        if self._illumina_data.undetermined is not None:
            for lane in self._illumina_data.undetermined.samples:
                for fastq in lane.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(lane.dirn, fastq),
                                   self._illumina_data.undetermined.name,
                                   lane.name))
        # Collect the data for each file
        if self._n_processors > 1:
            # Multiple cores
            pool = Pool(self._n_processors)
            results = pool.map(collect_fastq_data, fastqstats)
            pool.close()
            pool.join()
        else:
            # Single core
            # NB explicit list() because 'results' is iterated more
            # than once below (map() is a one-shot iterator on Py3)
            results = list(map(collect_fastq_data, fastqstats))
        # Set up tabfile to hold pre-existing data
        if filen is not None:
            existing_stats = TabFile(filen, first_line_is_header=True)
        else:
            existing_stats = None
        # Set up class to hold all collected data
        self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                            'Size', 'Nreads', 'Paired_end',
                                            'Read_number'))
        # Split result sets into R1 and R2; use lists (not filter()
        # iterators) as each is reused multiple times below
        results_r1 = [f for f in results if f.read_number == 1]
        results_r2 = [f for f in results if f.read_number == 2]
        # Determine which lanes are present and append
        # columns for each
        lanes = set()
        for fastq in results_r1:
            logger.debug("-- %s: lanes %s" %
                         (fastq.name, ','.join([str(l) for l in fastq.lanes])))
            lanes.update(fastq.lanes)
        # Add lane numbers from pre-existing stats file
        if existing_stats is not None:
            for c in existing_stats.header():
                if c.startswith('L'):
                    lanes.add(int(c[1:]))
        self._lanes = sorted(lanes)
        logger.debug("Lanes found: %s" %
                     ','.join([str(l) for l in self._lanes]))
        for lane in self._lanes:
            self._stats.appendColumn("L%s" % lane)
        # Copy pre-existing stats into new tabfile
        if existing_stats:
            for line in existing_stats:
                data = [
                    line['Project'], line['Sample'], line['Fastq'],
                    line['Size'], line['Nreads'], line['Paired_end'],
                    line['Read_number']
                ]
                # Iterate the *sorted* lane list so values line up
                # with the "L*" columns appended above (the raw set
                # has no guaranteed order)
                for lane in self._lanes:
                    try:
                        data.append(line["L%s" % lane])
                    except Exception:
                        # Lane column absent from the existing file
                        data.append('')
                self._stats.append(data=data)
        # Copy reads per lane from R1 FASTQs into R2
        for r2_fastq in results_r2:
            # Get corresponding R1 name
            logger.debug("-- Fastq R2: %s" % r2_fastq.name)
            r1_fastq_name = IlluminaFastq(r2_fastq.name)
            r1_fastq_name.read_number = 1
            r1_fastq_name = str(r1_fastq_name)
            logger.debug("--    -> R1: %s" % r1_fastq_name)
            # Locate corresponding data (list comprehension, not
            # filter(), so indexing works on Python 3)
            r1_fastq = [f for f in results_r1
                        if f.name.startswith(r1_fastq_name)][0]
            r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
        # Write the data into the tabfile
        paired_end = ('Y' if self._illumina_data.paired_end else 'N')
        for fastq in results:
            # Check for existing entry for this FASTQ
            existing_line = None
            for line in self._stats:
                if (line['Project'] == fastq.project
                        and line['Sample'] == fastq.sample
                        and line['Fastq'] == fastq.name):
                    existing_line = line
                    break
            # Write the data
            if existing_line is None:
                # Append new entry
                data = [
                    fastq.project, fastq.sample, fastq.name,
                    bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                    paired_end, fastq.read_number
                ]
                # Sorted lane order must match the appended columns
                for lane in self._lanes:
                    try:
                        data.append(fastq.reads_by_lane[lane])
                    except KeyError:
                        # No reads for this lane
                        data.append('')
                self._stats.append(data=data)
            else:
                # Overwrite existing entry
                logger.warning("Overwriting existing entry for "
                               "%s/%s/%s" %
                               (fastq.project, fastq.sample, fastq.name))
                existing_line['Size'] = bcf_utils.format_file_size(fastq.fsize)
                existing_line['Nreads'] = fastq.nreads
                existing_line['Paired_end'] = paired_end
                existing_line['Read_number'] = fastq.read_number
                for lane in self._lanes:
                    lane_name = "L%d" % lane
                    try:
                        existing_line[lane_name] = fastq.reads_by_lane[lane]
                    except KeyError:
                        # No reads for this lane
                        existing_line[lane_name] = ''

    def _open_output(self, out_file=None, fp=None):
        """
        Resolve the output stream for the reporting methods

        Arguments:
          out_file (str): name of file to open for writing (used
            if 'fp' is not supplied)
          fp (File): File-like object already open for writing

        Returns:
          Tuple (stream, close_on_exit): the stream to write to and
          a flag indicating whether the caller should close it (True
          only when this method opened a new file from 'out_file').
        """
        if fp is not None:
            return (fp, False)
        if out_file is None:
            return (sys.stdout, False)
        return (open(out_file, 'w'), True)

    @property
    def lane_names(self):
        """
        Return list of lane names (e.g. ['L1','L2',...])
        """
        return [("L%d" % l) for l in self._lanes]

    @property
    def raw(self):
        """
        Return the 'raw' statistics TabFile instance
        """
        return self._stats

    def report_full_stats(self, out_file=None, fp=None):
        """
        Report all statistics gathered for all FASTQs

        Essentially a dump of all the data.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report
        self._stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_basic_stats(self, out_file=None, fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report subset of columns
        stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size',
                                      'Nreads', 'Paired_end'))
        for line in self._stats:
            stats.append(data=[line[c] for c in stats.header()])
        stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_per_lane_sample_stats(self, out_file=None, fp=None):
        """
        Report of reads per sample in each lane

        Reports the number of reads for each sample in each
        lane plus the total reads for each lane.

        Example output:

        Lane 1
        Total reads = 182851745
        - KatyDobbs/KD-K1      79888058        43.7%
        - KatyDobbs/KD-K3      97854292        53.5%
        - Undetermined_indices/lane1       5109395 2.8%
        ...

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report
        lanes = self.lane_names
        for lane in lanes:
            lane_number = int(lane[1:])
            # R1 entries with a count for this lane; a list (not a
            # filter() iterator) because it is iterated twice below
            samples = [x for x in self._stats
                       if x['Read_number'] == 1 and bool(x[lane])]
            try:
                total_reads = sum([int(s[lane]) for s in samples])
            except Exception as ex:
                # Identify which sample(s) have the bad count before
                # re-raising
                for s in samples:
                    try:
                        int(s[lane])
                    except ValueError:
                        logger.critical("Bad value for read count in "
                                        "lane %s sample %s: '%s'" %
                                        (lane, s['Sample'], s[lane]))
                raise ex
            fpp.write("\nLane %d\n" % lane_number)
            fpp.write("Total reads = %d\n" % total_reads)
            for sample in samples:
                sample_name = "%s/%s" % (sample['Project'], sample['Sample'])
                nreads = float(sample[lane])
                # Guard against division by zero when all counts are 0
                percent = (nreads / total_reads * 100.0 if total_reads
                           else 0.0)
                fpp.write("- %s\t%d\t%.1f%%\n" %
                          (sample_name, nreads, percent))
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane', 'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads', '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane (every lane is present in
        # both dicts, so no KeyError handling is needed below)
        assigned = dict.fromkeys(self.lane_names, 0)
        unassigned = dict.fromkeys(self.lane_names, 0)
        # Count assigned and unassigned (= undetermined) reads,
        # considering only R1 entries which are not index reads
        for line in self._stats:
            if (line['Read_number'] != 1
                    or IlluminaFastq(line['Fastq']).is_index_read):
                continue
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in self.lane_names:
                if line[lane]:
                    counts[lane] += line[lane]
        # Write out data for each lane
        for lane in self.lane_names:
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            unassigned_reads = unassigned[lane]
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number, total_reads,
                                        assigned_reads, unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to file
        per_lane_stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()
Пример #3
0
    # Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error(
            "input XLS comes from MACS %s, this version only handles 1.4" %
            macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'], reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'], reverse=True)

    # Restore first line
    data.insert(0, tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1, len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns([
        'order', 'chr', 'start', 'end', 'length', 'summit', 'tags',
        '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)'
    ])

    # Legends text
    legends_text = """order\tSorting order Pvalue and FE
chr\tChromosome location of binding region
start\tStart coordinate of binding region
end\tEnd coordinate of binding region
Пример #4
0
     if well_list is not None:
         # Initialise barcode and sample names from well list
         stats_data = TabFile(column_names=('Barcode', 'Sample'))
         for barcode in well_list.barcodes():
             stats_data.append(data=(barcode,
                                     well_list.sample(barcode)))
     else:
         # Barcodes from collected data
         stats_data = TabFile(column_names=('Barcode', ))
         for barcode in stats.barcodes():
             stats_data.append(data=(barcode, ))
 else:
     # Append to an existing file
     stats_data = TabFile(filen=stats_file, first_line_is_header=True)
 # Add new columns of data
 stats_data.appendColumn(nreads_col)
 stats_data.appendColumn(umis_col)
 # Populate columns
 for data_line in stats_data:
     barcode = data_line['Barcode']
     try:
         data_line[nreads_col] = stats.nreads(barcode)
         data_line[umis_col] = len(stats.distinct_umis(barcode))
     except KeyError:
         data_line[nreads_col] = 0
         data_line[umis_col] = 0
 # Deal with 'unassigned' reads
 if args.unassigned:
     # Count reads for barcodes not in list
     unassigned_reads = 0
     unassigned_umis = set()