    def do_work(self):
        """Main wrapper function for running public MAF filter"""
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options['input_maf'],
            validation_stringency=ValidationStringency.Strict)

        # Header
        self.setup_maf_header()

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Counts
        processed = 0
        hotspot_gdc_set = set(['gdc_pon', 'common_in_exac'])

        try:
            for record in self.maf_reader:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} records...".format(processed))

                callers = record['callers'].value
                if (len(callers) >= self.options['min_callers'] and
                        record['Mutation_Status'].value.value == 'Somatic'):

                    self.metrics.add_sample_swap_metric(record)

                    gdc_filters = record['GDC_FILTER'].value
                    gfset = set(gdc_filters)

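                    # A hotspot record is kept as long as its only GDC filters
                    # are the panel-of-normals/ExAC flags in hotspot_gdc_set;
                    # every other record must have an empty GDC_FILTER list.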
                    if self.is_hotspot(record):
                        if len(gfset - hotspot_gdc_set) == 0:
                            self.write_record(record)

                    elif not gfset:
                        self.write_record(record)

                processed += 1
                self.metrics.input_records += 1

            self.logger.info("Processed {0} records.".format(processed))
            print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))

        finally:

            self.maf_reader.close()
            self.maf_writer.close()

    def load_readers(self):
        """
        Loads the list of MafReaders and sets the callers list.
        """
        maf_keys = [
            'mutect2', 'muse', 'vardict', 'varscan2', 'somaticsniper', 'pindel'
        ]

        for maf_key in maf_keys:
            if self.options[maf_key]:
                self.logger.info("{0} MAF {1}".format(maf_key,
                                                      self.options[maf_key]))
                self.maf_readers.append(
                    MafReader.reader_from(
                        path=self.options[maf_key],
                        validation_stringency=ValidationStringency.Strict))
                self.callers.append(maf_key)
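
All of these examples revolve around the same round trip: open a MafReader, take or build a MafHeader, open a MafWriter, and stream records between the two. Below is a minimal sketch of that pattern; the maflib import paths and the input.maf/output.maf file names are assumptions for illustration, not something the examples themselves show.

# Minimal read/copy/write sketch. Import paths and file names are assumed.
from maflib.reader import MafReader
from maflib.validation import ValidationStringency
from maflib.writer import MafWriter

reader = MafReader.reader_from(
    path="input.maf",  # hypothetical input path
    validation_stringency=ValidationStringency.Strict)

# Reuse the parsed header so the writer emits the same scheme and columns.
writer = MafWriter.from_path(
    path="output.maf",  # hypothetical output path
    header=reader.header(),
    validation_stringency=ValidationStringency.Strict)

try:
    for record in reader:
        writer += record  # MafWriter supports += to append a record
finally:
    reader.close()
    writer.close()
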
Example #3
    def __main__(cls, options):
        """The main method."""
        logger = Logger.get_logger(cls.__name__)

        if options.output is None:
            handle = sys.stdout
        else:
            handle = open(options.output, "w")

        errors = ValidationErrors(
            options.max_errors if options.max_errors else sys.maxsize)

        for path in options.input:
            logger.info("Examining %s", path)

            # Gather as many errors as possible
            silent = ValidationStringency.Silent
            reader = MafReader.reader_from(path=path,
                                           validation_stringency=silent,
                                           scheme=options.scheme)

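            # Handle errors collected while parsing the header first; only if
            # that does not signal a stop, validate the records one by one.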
            if not cls.__process_errors(options, reader, logger, handle,
                                        errors):
                n = 0
                for _ in reader:
                    if cls.__process_errors(options, reader, logger, handle,
                                            errors):
                        break
                    n += 1
                    if n % Validate.print_every_n_records == 0:
                        logger.info("Processed %d records", n)
                if n == 0 or n % Validate.print_every_n_records != 0:
                    logger.info("Processed %d records", n)

            reader.close()

            cls.__print_report(options, errors, handle)

        if options.output:
            handle.close()
Example #4
    def test_with_sorting(self):
        scheme = TestMafWriter.TestCoordinateScheme()
        fd, path = tempfile.mkstemp()

        # Create the header
        header_lines = (MafHeader.scheme_header_lines(scheme) +
                        ["#key1 value1", "#key2 value2"] + [
                            "%s%s %s" % (
                                MafHeader.HeaderLineStartSymbol,
                                MafHeader.SortOrderKey,
                                Coordinate().name(),
                            )
                        ] + ["\t".join(scheme.column_names())])
        header = MafHeader.from_lines(
            lines=header_lines,
            validation_stringency=ValidationStringency.Silent)

        # Write the header, then three records
        writer = MafWriter.from_path(
            header=header,
            validation_stringency=ValidationStringency.Lenient,
            path=path,
            assume_sorted=False,
        )
        writer += TestMafWriter.DummyRecord("chr1", 2, 2)
        writer += TestMafWriter.DummyRecord("chr1", 3, 3)
        writer += TestMafWriter.DummyRecord("chr1", 4, 4)
        writer.close()

        reader = MafReader.reader_from(path=path, scheme=scheme)
        header = reader.header()
        records = [rec for rec in reader]
        reader.close()

        self.assertEqual(header.sort_order().name(), Coordinate.name())

        self.assertListEqual([r["Start_Position"].value for r in records],
                             [2, 3, 4])
        self.assertListEqual([r["End_Position"].value for r in records],
                             [2, 3, 4])
Example #5
    def test_reader_out_of_order(self):
        column_names = ["Chromosome", "Start_Position", "End_Position"]
        scheme = NoRestrictionsScheme(column_names)
        header_version = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            scheme.version(),
        )
        header_sort_order = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.SortOrderKey,
            Coordinate(),
        )

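        # The record rows below are deliberately out of coordinate order
        # (start positions 1, 4, 2), so iterating the reader must fail.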
        lines = [
            header_version,
            header_sort_order,
            "\t".join(column_names),
            "\t".join(["A", "1", "1"]),
            "\t".join(["A", "4", "4"]),
            "\t".join(["A", "2", "2"]),
        ]

        fh, fn = tmp_file(lines=lines)
        fh.close()

        reader = MafReader.reader_from(
            path=fn,
            validation_stringency=ValidationStringency.Silent,
            scheme=scheme)

        self.assertEqual(reader.scheme().version(), scheme.version())
        self.assertEqual(reader.header().version(), scheme.version())
        self.assertEqual(reader.header().sort_order().name(),
                         Coordinate().name())

        with self.assertRaises(ValueError):
            records = [record for record in reader]

        reader.close()
Example #6
    def __main__(cls, options):
        """The main method."""
        logger = Logger.get_logger(cls.__name__)

        reader = MafReader.reader_from(
            path=options.input,
            validation_stringency=options.validation_stringency,
            scheme=options.scheme)

        writer = writer_from_reader(reader=reader, options=options)

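        # MafSorter buffers records (up to max_objects_in_ram at a time) and
        # yields them back in the requested sort order.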
        sorter = MafSorter(max_objects_in_ram=100000,
                           sort_order_name=options.sort_order,
                           scheme=writer.header().scheme(),
                           fasta_index=options.fasta_index)

        # add the records to the sorter
        n = 0
        for record in reader:
            sorter += record
            n += 1
            if options.output and n % Sort.print_every_n_records == 0:
                logger.info("Sorted %d records", n)
        if options.output and (n == 0 or n % Sort.print_every_n_records != 0):
            logger.info("Sorted %d records", n)

        # read from the sorter
        n = 0
        for record in sorter:
            writer += record
            n += 1
            if options.output and n % Sort.print_every_n_records == 0:
                logger.info("Wrote %d records", n)
        if options.output and (n == 0 or n % Sort.print_every_n_records != 0):
            logger.info("Wrote %d records", n)

        sorter.close()
        reader.close()
        writer.close()
Example #7
    def __main__(cls, options):
        """The main method."""
        logger = Logger.get_logger(cls.__name__)

        reader = MafReader.reader_from(
            path=options.input,
            validation_stringency=options.validation_stringency,
            scheme=options.scheme)

        writer = writer_from_reader(reader=reader, options=options)

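        # Stream every record from the reader straight to the writer.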
        n = 0
        for record in reader:
            writer += record
            n += 1
            if options.output and n % View.print_every_n_records == 0:
                logger.info("Processed %d records", n)
        if options.output and (n == 0 or n % View.print_every_n_records != 0):
            logger.info("Processed %d records", n)

        reader.close()
        writer.close()
Example #8
    def test_reader_from_with_scheme(self):
        scheme = TestMafReader.TestScheme()
        header = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            scheme.version(),
        )
        column_names = scheme.column_names()

        lines = [
            header,
            "\t".join(column_names),
            "\t".join(["cell-1-1", "1.314", "cell-1-2"]),
            "\t".join(["cell-2-1", "2.314", "cell-2-2"]),
            "\t".join(["cell-3-1", "3.314", "cell-3-2"]),
        ]

        fh, fn = tmp_file(lines=lines)
        fh.close()

        reader = MafReader.reader_from(
            path=fn,
            validation_stringency=ValidationStringency.Silent,
            scheme=scheme)
        records = [record for record in reader]

        self.assertEqual(reader.scheme().version(), scheme.version())
        self.assertEqual(reader.header().version(), scheme.version())
        self.assertEqual(len(reader.header()), 1)
        self.assertEqual(len(records), 3)
        self.assertListEqual([r["str1"].value for r in records],
                             ["cell-1-1", "cell-2-1", "cell-3-1"])
        self.assertListEqual([r["float"].value for r in records],
                             [1.314, 2.314, 3.314])

        reader.close()

    def do_work(self):
        """Main wrapper function for running the public MAF filter."""
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options["input_maf"],
            validation_stringency=ValidationStringency.Strict,
        )

        # Header
        self.setup_maf_header()

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Counts
        processed = 0
        hotspot_gdc_set = set(["gdc_pon", "common_in_gnomAD"])
        nonexonic_set = set(["NonExonic"])

        try:
            for record in self.maf_reader:

                if processed > 0 and processed % 1000 == 0:
                    self.logger.info(
                        "Processed {0} records...".format(processed))

                callers = record["callers"].value
                if (len(callers) >= self.options["min_callers"] and
                        record["Mutation_Status"].value.value == "Somatic"):

                    self.metrics.add_sample_swap_metric(record)

                    gdc_filters = record["GDC_FILTER"].value
                    gfset = set(gdc_filters)

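                    # Hotspots tolerate the flags in hotspot_gdc_set (plus
                    # NonExonic when the variant is at a splice site); other
                    # records must be unfiltered or hit the NonExonic-only
                    # splice rescue below.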
                    if self.is_hotspot(record):
                        other_filts = gfset - hotspot_gdc_set
                        if len(other_filts) == 0:
                            self.write_record(record)
                        elif (len(other_filts - nonexonic_set) == 0
                              and self.is_splice(record)):
                            # Rescue splicing if NonExonic
                            self.write_record(record)

                    # Rescue splicing if NonExonic
                    elif (len(gfset - nonexonic_set) == 0
                          and self.is_splice(record)):
                        self.write_record(record)

                    elif not gfset:
                        self.write_record(record)

                processed += 1
                self.metrics.input_records += 1

            self.logger.info("Processed {0} records.".format(processed))
            print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))

        finally:

            self.maf_reader.close()
            self.maf_writer.close()