# Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error("input XLS comes from MACS %s, this version only handles 1.4" %
                      macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'],reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'],reverse=True)

    # Restore first line
    data.insert(0,tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1,len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns(['order','chr','start','end','length','summit','tags',
                                '-10*log10(pvalue)','fold_enrichment','FDR(%)'])

    # Legends text
    legends_text = """order\tSorting order Pvalue and FE
chr\tChromosome location of binding region
start\tStart coordinate of binding region
end\tEnd coordinate of binding region
summit-100\tSummit - 100bp
summit+100\tSummit + 100bp
Пример #2
0
class FastqStatistics:
    """
    Class for collecting and reporting stats on Illumina FASTQs

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq or bcl2fastq2,
    collects statistics for each file and provides methods for
    reporting different aspects.

    Example usage:

    >>> from IlluminaData import IlluminaData
    >>> data = IlluminaData('120117_BLAH_JSHJHXXX','bcl2fastq')
    >>> stats = FastqStatistics(data)
    >>> stats.report_basic_stats('basic_stats.out')

    """
    def __init__(self, illumina_data, n_processors=1, add_to=None):
        """
        Create a new FastqStatistics instance

        Arguments:
          illumina_data: populated IlluminaData object describing the
            run.
          n_processors: number of processors to use (if >1 then uses
            the multiprocessing library to run the statistics gathering
            using multiple cores).
          add_to: optional, add the data to that from an existing
            statistics file
        """
        self._illumina_data = illumina_data
        self._n_processors = n_processors
        self._stats = None
        # NB kept for backwards compatibility; lane names are exposed
        # via the 'lane_names' property (which uses self._lanes)
        self._lane_names = []
        self._get_data(filen=add_to)

    def _get_data(self, filen=None):
        """
        Collect statistics for FASTQ outputs from an Illumina run

        Arguments:
          filen: optional, name of an existing statistics file whose
            data will be merged with the newly collected statistics
        """
        # Collect FASTQ files
        fastqstats = []
        for project in self._illumina_data.projects:
            for sample in project.samples:
                for fastq in sample.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(sample.dirn, fastq),
                                   project.name, sample.name))
        # Gather same information for undetermined reads (if present)
        if self._illumina_data.undetermined is not None:
            for lane in self._illumina_data.undetermined.samples:
                for fastq in lane.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(lane.dirn, fastq),
                                   self._illumina_data.undetermined.name,
                                   lane.name))
        # Collect the data for each file
        if self._n_processors > 1:
            # Multiple cores
            pool = Pool(self._n_processors)
            results = pool.map(collect_fastq_data, fastqstats)
            pool.close()
            pool.join()
        else:
            # Single core
            # NB explicit list() because 'results' is iterated more
            # than once below (map() is a one-shot iterator on Py3)
            results = list(map(collect_fastq_data, fastqstats))
        # Set up tabfile to hold pre-existing data
        if filen is not None:
            existing_stats = TabFile(filen, first_line_is_header=True)
        else:
            existing_stats = None
        # Set up class to hold all collected data
        self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                            'Size', 'Nreads', 'Paired_end',
                                            'Read_number'))
        # Split result sets into R1 and R2; use lists (not filter()
        # iterators) as each is reused multiple times below
        results_r1 = [f for f in results if f.read_number == 1]
        results_r2 = [f for f in results if f.read_number == 2]
        # Determine which lanes are present and append
        # columns for each
        lanes = set()
        for fastq in results_r1:
            logger.debug("-- %s: lanes %s" %
                         (fastq.name, ','.join([str(l) for l in fastq.lanes])))
            lanes.update(fastq.lanes)
        # Add lane numbers from pre-existing stats file
        if existing_stats is not None:
            for c in existing_stats.header():
                if c.startswith('L'):
                    lanes.add(int(c[1:]))
        self._lanes = sorted(lanes)
        logger.debug("Lanes found: %s" %
                     ','.join([str(l) for l in self._lanes]))
        for lane in self._lanes:
            self._stats.appendColumn("L%s" % lane)
        # Copy pre-existing stats into new tabfile
        if existing_stats:
            for line in existing_stats:
                data = [
                    line['Project'], line['Sample'], line['Fastq'],
                    line['Size'], line['Nreads'], line['Paired_end'],
                    line['Read_number']
                ]
                # Iterate the *sorted* lane list so values line up
                # with the "L*" columns appended above (the raw set
                # has no guaranteed order)
                for lane in self._lanes:
                    try:
                        data.append(line["L%s" % lane])
                    except Exception:
                        # Lane column absent from the existing file
                        data.append('')
                self._stats.append(data=data)
        # Copy reads per lane from R1 FASTQs into R2
        for r2_fastq in results_r2:
            # Get corresponding R1 name
            logger.debug("-- Fastq R2: %s" % r2_fastq.name)
            r1_fastq_name = IlluminaFastq(r2_fastq.name)
            r1_fastq_name.read_number = 1
            r1_fastq_name = str(r1_fastq_name)
            logger.debug("--    -> R1: %s" % r1_fastq_name)
            # Locate corresponding data (list comprehension, not
            # filter(), so indexing works on Python 3)
            r1_fastq = [f for f in results_r1
                        if f.name.startswith(r1_fastq_name)][0]
            r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
        # Write the data into the tabfile
        paired_end = ('Y' if self._illumina_data.paired_end else 'N')
        for fastq in results:
            # Check for existing entry for this FASTQ
            existing_line = None
            for line in self._stats:
                if (line['Project'] == fastq.project
                        and line['Sample'] == fastq.sample
                        and line['Fastq'] == fastq.name):
                    existing_line = line
                    break
            # Write the data
            if existing_line is None:
                # Append new entry
                data = [
                    fastq.project, fastq.sample, fastq.name,
                    bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                    paired_end, fastq.read_number
                ]
                # Sorted lane order must match the appended columns
                for lane in self._lanes:
                    try:
                        data.append(fastq.reads_by_lane[lane])
                    except KeyError:
                        # No reads for this lane
                        data.append('')
                self._stats.append(data=data)
            else:
                # Overwrite existing entry
                logger.warning("Overwriting existing entry for "
                               "%s/%s/%s" %
                               (fastq.project, fastq.sample, fastq.name))
                existing_line['Size'] = bcf_utils.format_file_size(fastq.fsize)
                existing_line['Nreads'] = fastq.nreads
                existing_line['Paired_end'] = paired_end
                existing_line['Read_number'] = fastq.read_number
                for lane in self._lanes:
                    lane_name = "L%d" % lane
                    try:
                        existing_line[lane_name] = fastq.reads_by_lane[lane]
                    except KeyError:
                        # No reads for this lane
                        existing_line[lane_name] = ''

    def _open_output(self, out_file=None, fp=None):
        """
        Resolve the output stream for the reporting methods

        Arguments:
          out_file (str): name of file to open for writing (used
            if 'fp' is not supplied)
          fp (File): File-like object already open for writing

        Returns:
          Tuple (stream, close_on_exit): the stream to write to and
          a flag indicating whether the caller should close it (True
          only when this method opened a new file from 'out_file').
        """
        if fp is not None:
            return (fp, False)
        if out_file is None:
            return (sys.stdout, False)
        return (open(out_file, 'w'), True)

    @property
    def lane_names(self):
        """
        Return list of lane names (e.g. ['L1','L2',...])
        """
        return [("L%d" % l) for l in self._lanes]

    @property
    def raw(self):
        """
        Return the 'raw' statistics TabFile instance
        """
        return self._stats

    def report_full_stats(self, out_file=None, fp=None):
        """
        Report all statistics gathered for all FASTQs

        Essentially a dump of all the data.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report
        self._stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_basic_stats(self, out_file=None, fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report subset of columns
        stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size',
                                      'Nreads', 'Paired_end'))
        for line in self._stats:
            stats.append(data=[line[c] for c in stats.header()])
        stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_per_lane_sample_stats(self, out_file=None, fp=None):
        """
        Report of reads per sample in each lane

        Reports the number of reads for each sample in each
        lane plus the total reads for each lane.

        Example output:

        Lane 1
        Total reads = 182851745
        - KatyDobbs/KD-K1      79888058        43.7%
        - KatyDobbs/KD-K3      97854292        53.5%
        - Undetermined_indices/lane1       5109395 2.8%
        ...

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Report
        lanes = self.lane_names
        for lane in lanes:
            lane_number = int(lane[1:])
            # R1 entries with a count for this lane; a list (not a
            # filter() iterator) because it is iterated twice below
            samples = [x for x in self._stats
                       if x['Read_number'] == 1 and bool(x[lane])]
            try:
                total_reads = sum([int(s[lane]) for s in samples])
            except Exception as ex:
                # Identify which sample(s) have the bad count before
                # re-raising
                for s in samples:
                    try:
                        int(s[lane])
                    except ValueError:
                        logger.critical("Bad value for read count in "
                                        "lane %s sample %s: '%s'" %
                                        (lane, s['Sample'], s[lane]))
                raise ex
            fpp.write("\nLane %d\n" % lane_number)
            fpp.write("Total reads = %d\n" % total_reads)
            for sample in samples:
                sample_name = "%s/%s" % (sample['Project'], sample['Sample'])
                nreads = float(sample[lane])
                # Guard against division by zero when all counts are 0
                percent = (nreads / total_reads * 100.0 if total_reads
                           else 0.0)
                fpp.write("- %s\t%d\t%.1f%%\n" %
                          (sample_name, nreads, percent))
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()

    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        fpp, close_fpp = self._open_output(out_file=out_file, fp=fp)
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane', 'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads', '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane (every lane is present in
        # both dicts, so no KeyError handling is needed below)
        assigned = dict.fromkeys(self.lane_names, 0)
        unassigned = dict.fromkeys(self.lane_names, 0)
        # Count assigned and unassigned (= undetermined) reads,
        # considering only R1 entries which are not index reads
        for line in self._stats:
            if (line['Read_number'] != 1
                    or IlluminaFastq(line['Fastq']).is_index_read):
                continue
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in self.lane_names:
                if line[lane]:
                    counts[lane] += line[lane]
        # Write out data for each lane
        for lane in self.lane_names:
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            unassigned_reads = unassigned[lane]
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number, total_reads,
                                        assigned_reads, unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to file
        per_lane_stats.write(fp=fpp, include_header=True)
        # Close file (only if we opened it here)
        if close_fpp:
            fpp.close()
Пример #3
0
    # Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error(
            "input XLS comes from MACS %s, this version only handles 1.4" %
            macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'], reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'], reverse=True)

    # Restore first line
    data.insert(0, tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1, len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns([
        'order', 'chr', 'start', 'end', 'length', 'summit', 'tags',
        '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)'
    ])

    # Legends text
    legends_text = """order\tSorting order Pvalue and FE
chr\tChromosome location of binding region
start\tStart coordinate of binding region
end\tEnd coordinate of binding region
Пример #4
0
     if well_list is not None:
         # Initialise barcode and sample names from well list
         stats_data = TabFile(column_names=('Barcode', 'Sample'))
         for barcode in well_list.barcodes():
             stats_data.append(data=(barcode,
                                     well_list.sample(barcode)))
     else:
         # Barcodes from collected data
         stats_data = TabFile(column_names=('Barcode', ))
         for barcode in stats.barcodes():
             stats_data.append(data=(barcode, ))
 else:
     # Append to an existing file
     stats_data = TabFile(filen=stats_file, first_line_is_header=True)
 # Add new columns of data
 stats_data.appendColumn(nreads_col)
 stats_data.appendColumn(umis_col)
 # Populate columns
 for data_line in stats_data:
     barcode = data_line['Barcode']
     try:
         data_line[nreads_col] = stats.nreads(barcode)
         data_line[umis_col] = len(stats.distinct_umis(barcode))
     except KeyError:
         data_line[nreads_col] = 0
         data_line[umis_col] = 0
 # Deal with 'unassigned' reads
 if args.unassigned:
     # Count reads for barcodes not in list
     unassigned_reads = 0
     unassigned_umis = set()