Exemplo n.º 1
0
    def test_from_reader(self):
        """Derive a MafHeader from a reader, first as-is, then with overrides.

        The reader is built from the version, annotation and sort-order
        header lines with the protected scheme, and is closed before any
        header is derived from it.
        """
        protected = GdcV1_0_0_ProtectedScheme()

        header_lines = [
            TestMafHeader.__version_line,
            TestMafHeader.__annotation_line,
            TestMafHeader.__sort_order_line,
        ]
        reader = MafReader(
            lines=header_lines,
            validation_stringency=ValidationStringency.Silent,
            scheme=protected,
        )
        reader.close()

        # Case 1: nothing overridden -- the header is expected to mirror the
        # protected scheme and to report coordinate sort order.
        derived = MafHeader.from_reader(reader=reader)
        self.assertEqual(derived.scheme().version(), protected.version())
        self.assertEqual(
            derived.scheme().annotation_spec(),
            protected.annotation_spec(),
        )
        self.assertEqual(derived.sort_order().name(), Coordinate.name())

        # Case 2: version, annotation and sort order are all overridden with
        # the public scheme's values and the unsorted order.
        public = GdcV1_0_0_PublicScheme()
        overridden = MafHeader.from_reader(
            reader=reader,
            version=public.version(),
            annotation=public.annotation_spec(),
            sort_order=sort_order.Unsorted().name(),
        )
        self.assertEqual(overridden.scheme().version(), public.version())
        self.assertEqual(
            overridden.scheme().annotation_spec(),
            public.annotation_spec(),
        )
        self.assertEqual(
            overridden.sort_order().name(),
            sort_order.Unsorted().name(),
        )
Exemplo n.º 2
0
    def test_different_column_names_but_same_named_scheme(self):
        """Check the errors reported when the column names are reversed.

        The same header (version, annotation spec, reversed column names) is
        read twice: once with an explicit scheme and once without one.
        """
        reversed_names = "\t".join(reversed(TestMafReader.Names))
        lines = [
            TestMafReader.Version,
            TestMafReader.AnnotationSpec,
            reversed_names,
        ]

        # Errors expected from both readers.
        header_errors = [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
        ]
        # Additional errors expected only when a scheme is supplied.
        column_errors = [
            MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
            MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
        ]

        # With an explicit scheme, two column-name mismatches are expected on
        # top of the two header errors.
        reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
        self.assertIsNone(next(reader, None))
        observed = [e.tpe for e in reader.validation_errors]
        self.assertListEqual(observed, header_errors + column_errors)

        # Without a scheme, only the two header errors are expected.
        reader = MafReader(lines=lines, scheme=None)
        self.assertIsNone(next(reader, None))
        observed = [e.tpe for e in reader.validation_errors]
        self.assertListEqual(observed, header_errors)
Exemplo n.º 3
0
    def test_use_scheme_from_header_basic(self):
        """With no scheme supplied, the header's version line selects it.

        The input holds only a basic-scheme version line and the basic
        scheme's column names; no validation errors are expected.
        """
        version_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            GdcV1_0_0_BasicScheme.version(),
        )
        column_line = "\t".join(GdcV1_0_0_BasicScheme().column_names())

        reader = MafReader(lines=[version_line, column_line], scheme=None)

        # There are no record lines, so iteration yields nothing.
        self.assertIsNone(next(reader, None))
        self.assertEqual(len(reader.validation_errors), 0)
        self.assertEqual(
            reader.header().scheme().version(),
            GdcV1_0_0_BasicScheme().version(),
        )
    def do_work(self):
        """Main wrapper function for running public MAF filter.

        Reads ``options['input_maf']`` with strict validation and writes the
        records that pass the filter to ``options['output_maf']``.

        A record is considered only when its ``callers`` column holds at
        least ``options['min_callers']`` entries and its ``Mutation_Status``
        is ``'Somatic'``.  A considered record is written when either:

        * ``self.is_hotspot(record)`` is true and every ``GDC_FILTER`` tag is
          one of ``gdc_pon`` / ``common_in_exac``; or
        * it is not a hotspot and has no ``GDC_FILTER`` tags at all.

        Every record read increments ``self.metrics.input_records``.  Once
        the input is exhausted the metrics are printed to stdout as JSON.

        The reader and the writer are closed on every exit path, including
        when header setup, writer creation or record processing raises.
        The writer is closed first, then the reader.
        """
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options['input_maf'],
            validation_stringency=ValidationStringency.Strict)

        try:
            # Header
            self.setup_maf_header()

            # Writer
            self.maf_writer = MafWriter.from_path(
                path=self.options['output_maf'],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict)

            try:
                self._scheme = self.maf_header.scheme()
                self._columns = get_columns_from_header(self.maf_header)
                self._colset = set(self._columns)

                # Counts
                processed = 0
                # GDC_FILTER tags that do not disqualify a hotspot record.
                hotspot_gdc_set = {'gdc_pon', 'common_in_exac'}

                for record in self.maf_reader:

                    if processed > 0 and processed % 1000 == 0:
                        self.logger.info(
                            "Processed {0} records...".format(processed))

                    callers = record['callers'].value
                    if len(callers) >= self.options['min_callers'] and \
                      record['Mutation_Status'].value.value == 'Somatic':

                        self.metrics.add_sample_swap_metric(record)

                        gfset = set(record['GDC_FILTER'].value)

                        if self.is_hotspot(record):
                            # Hotspots tolerate only the whitelisted tags.
                            if gfset.issubset(hotspot_gdc_set):
                                self.write_record(record)

                        elif not gfset:
                            self.write_record(record)

                    processed += 1
                    self.metrics.input_records += 1

                self.logger.info("Processed {0} records.".format(processed))
                print(json.dumps(self.metrics.to_json(), indent=2,
                                 sort_keys=True))

            finally:
                # Runs even if processing failed part-way through.
                self.maf_writer.close()

        finally:
            # Runs even if header setup or writer creation failed.
            self.maf_reader.close()
Exemplo n.º 5
0
    def test_iter(self):
        """Iterating a scheme-less reader over one data row yields one record."""
        header_lines = [
            TestMafReader.Version,
            TestMafReader.AnnotationSpec,
            "\t".join(reversed(TestMafReader.Names)),
        ]
        data_row = "\t".join(["string1", "3.14", "string2"])

        reader = MafReader(lines=header_lines + [data_row], scheme=None)

        # Count the records by exhausting the reader.
        record_count = sum(1 for _ in reader)
        self.assertEqual(record_count, 1)
Exemplo n.º 6
0
    def test_use_default_scheme(self):
        """A header without a version line leaves the header scheme unset.

        The input holds only an annotation-spec line and the basic scheme's
        column names.  The reader is still expected to expose a scheme whose
        version matches that of a NoRestrictionsScheme.
        """
        column_line = "\t".join(GdcV1_0_0_BasicScheme().column_names())
        reader = MafReader(
            lines=[TestMafReader.AnnotationSpec, column_line],
            scheme=None,
        )

        self.assertIsNone(next(reader, None))

        # Exactly two problems: the missing version and the annotation spec.
        expected_errors = [
            MafValidationErrorType.HEADER_MISSING_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
        ]
        self.assertEqual(len(reader.validation_errors), 2)
        self.assertListEqual(
            [e.tpe for e in reader.validation_errors],
            expected_errors,
        )

        # The header carries no scheme, but the reader itself does.
        self.assertIsNone(reader.header().scheme())
        self.assertEqual(
            reader.scheme().version(),
            NoRestrictionsScheme(column_names=list()).version(),
        )
Exemplo n.º 7
0
    def test_use_different_scheme(self):
        """Header names the protected scheme; columns are the basic scheme's.

        With no scheme passed to the reader, a single column-count mismatch
        is expected, and the versions and annotation specs of the reader's
        and header's schemes are checked against the expectations below.
        """
        version_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            GdcV1_0_0_ProtectedScheme.version(),
        )
        annotation_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.AnnotationSpecKey,
            GdcV1_0_0_ProtectedScheme.annotation_spec(),
        )
        column_line = "\t".join(GdcV1_0_0_BasicScheme().column_names())

        reader = MafReader(
            lines=[version_line, annotation_line, column_line],
            scheme=None,
        )
        self.assertIsNone(next(reader, None))

        # One error only: the number of column names does not match.
        self.assertEqual(len(reader.validation_errors), 1)
        self.assertListEqual(
            [e.tpe for e in reader.validation_errors],
            [MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES],
        )

        # Versions reported by the header's scheme and the reader's scheme.
        self.assertEqual(
            reader.header().scheme().version(),
            GdcV1_0_0_BasicScheme.version(),
        )
        self.assertEqual(
            reader.scheme().version(),
            GdcV1_0_0_ProtectedScheme().version(),
        )

        # Both are expected to report the protected annotation spec.
        self.assertEqual(
            reader.scheme().annotation_spec(),
            GdcV1_0_0_ProtectedScheme.annotation_spec(),
        )
        self.assertEqual(
            reader.header().scheme().annotation_spec(),
            GdcV1_0_0_ProtectedScheme.annotation_spec(),
        )
Exemplo n.º 8
0
    def test_missing_column_names(self):
        """A header with no column-names line reports the missing columns."""
        header_only = [TestMafReader.Version, TestMafReader.AnnotationSpec]

        # With a scheme supplied, three errors are expected: the two header
        # errors plus the missing column names.
        expected_errors = [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
            MafValidationErrorType.HEADER_MISSING_COLUMN_NAMES,
        ]

        reader = MafReader(lines=header_only, scheme=TestMafReader.Scheme)
        self.assertIsNone(next(reader, None))
        observed = [e.tpe for e in reader.validation_errors]
        self.assertListEqual(observed, expected_errors)
Exemplo n.º 9
0
    def test_different_scheme_in_header(self):
        """Header version names the basic scheme; columns are the test names.

        The single error reported depends on whether the reader was given an
        explicit scheme.
        """
        version_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            GdcV1_0_0_BasicScheme.version(),
        )
        lines = [version_line, "\t".join(TestMafReader.Names)]

        # Explicit scheme: the header disagrees with it.
        reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
        self.assertIsNone(next(reader, None))
        observed = [e.tpe for e in reader.validation_errors]
        self.assertListEqual(
            observed,
            [MafValidationErrorType.HEADER_MISMATCH_SCHEME],
        )

        # No scheme: a column-count mismatch is reported instead.
        reader = MafReader(lines=lines, scheme=None)
        self.assertIsNone(next(reader, None))
        observed = [e.tpe for e in reader.validation_errors]
        self.assertListEqual(
            observed,
            [MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES],
        )
    def load_readers(self):
        """
        Opens a strict MafReader for every caller whose option is set.

        For each caller key with a truthy entry in ``self.options``, the
        reader is appended to ``self.maf_readers`` and the key is appended to
        ``self.callers``, so the two lists stay index-aligned.
        """
        for caller in ('mutect2', 'muse', 'vardict', 'varscan2',
                       'somaticsniper', 'pindel'):
            maf_path = self.options[caller]
            if not maf_path:
                # No MAF was supplied for this caller.
                continue

            self.logger.info("{0} MAF {1}".format(caller, maf_path))
            reader = MafReader.reader_from(
                path=maf_path,
                validation_stringency=ValidationStringency.Strict)
            self.maf_readers.append(reader)
            self.callers.append(caller)
Exemplo n.º 11
0
    def test_record_errors(self):
        """A row with a bad value is still yielded, and its errors recorded."""
        header_lines = [
            TestMafReader.Version,
            TestMafReader.AnnotationSpec,
            "\t".join(TestMafReader.Names),
        ]
        # The middle cell is not a number.
        bad_row = "\t".join(["string1", "string-float", "string2"])

        reader = MafReader(lines=header_lines + [bad_row],
                           scheme=TestMafReader.Scheme)

        # Exhaust the reader; the single row is expected to come back.
        record_count = sum(1 for _ in reader)
        self.assertEqual(record_count, 1)

        expected_errors = [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
            MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
            MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
        ]
        self.assertListEqual(
            [e.tpe for e in reader.validation_errors],
            expected_errors,
        )
Exemplo n.º 12
0
    def __main__(cls, options):
        """The main method.

        Validates each path in ``options.input`` and prints a report for it.

        Reports go to ``options.output`` when it is given, otherwise to
        stdout.  Each input is read with silent validation stringency so
        that as many errors as possible are gathered.  Errors are accumulated
        in one ``ValidationErrors`` instance shared across all inputs, capped
        at ``options.max_errors`` (or ``sys.maxsize`` when that is falsy).

        A truthy return from ``__process_errors`` stops processing of the
        current reader.

        The output handle (when opened here) and every reader are closed even
        if an exception is raised part-way through.
        """
        logger = Logger.get_logger(cls.__name__)

        # Only a handle opened here is ours to close; sys.stdout is left open.
        owns_handle = options.output is not None
        handle = open(options.output, "w") if owns_handle else sys.stdout

        try:
            errors = ValidationErrors(
                options.max_errors if options.max_errors else sys.maxsize)

            for path in options.input:
                logger.info("Examining %s", path)

                # Gather as many errors as possible
                silent = ValidationStringency.Silent
                reader = MafReader.reader_from(path=path,
                                               validation_stringency=silent,
                                               scheme=options.scheme)

                try:
                    # First call handles errors found before any record is
                    # read; records are iterated only if it did not stop.
                    if not cls.__process_errors(options, reader, logger,
                                                handle, errors):
                        n = 0
                        for _ in reader:
                            if cls.__process_errors(options, reader, logger,
                                                    handle, errors):
                                break
                            n = n + 1
                            if n % Validate.print_every_n_records == 0:
                                logger.info("Processed %d records", n)
                        # Final count, unless the loop just logged it.
                        if n == 0 or n % Validate.print_every_n_records != 0:
                            logger.info("Processed %d records", n)
                finally:
                    reader.close()

                cls.__print_report(options, errors, handle)
        finally:
            if owns_handle:
                handle.close()
Exemplo n.º 13
0
    def test_with_sorting(self):
        """Write three records through a sorting writer and read them back.

        The header declares coordinate sort order and the writer is created
        with ``assume_sorted=False``.  The file is then re-read, and the
        header's sort order and the record positions are checked.

        The temporary file's descriptor is closed immediately and the file is
        removed when the test finishes.
        """
        import os

        scheme = TestMafWriter.TestCoordinateScheme()
        fd, path = tempfile.mkstemp()
        # mkstemp hands back an open OS-level descriptor; the writer and
        # reader open the path themselves, so release it right away.
        os.close(fd)

        try:
            # Create the header
            header_lines = (MafHeader.scheme_header_lines(scheme) +
                            ["#key1 value1", "#key2 value2"] + [
                                "%s%s %s" % (
                                    MafHeader.HeaderLineStartSymbol,
                                    MafHeader.SortOrderKey,
                                    Coordinate().name(),
                                )
                            ] + ["\t".join(scheme.column_names())])
            header = MafHeader.from_lines(
                lines=header_lines,
                validation_stringency=ValidationStringency.Silent)

            # Write the header and three records
            writer = MafWriter.from_path(
                header=header,
                validation_stringency=ValidationStringency.Lenient,
                path=path,
                assume_sorted=False,
            )
            writer += TestMafWriter.DummyRecord("chr1", 2, 2)
            writer += TestMafWriter.DummyRecord("chr1", 3, 3)
            writer += TestMafWriter.DummyRecord("chr1", 4, 4)
            writer.close()

            reader = MafReader.reader_from(path=path, scheme=scheme)
            header = reader.header()
            records = [rec for rec in reader]
            reader.close()
        finally:
            # mkstemp never deletes the file on its own.
            os.remove(path)

        self.assertEqual(header.sort_order().name(), Coordinate.name())

        self.assertListEqual([r["Start_Position"].value for r in records],
                             [2, 3, 4])
        self.assertListEqual([r["End_Position"].value for r in records],
                             [2, 3, 4])
Exemplo n.º 14
0
    def test_reader_out_of_order(self):
        """Out-of-order rows under a coordinate sort order raise ValueError.

        The header declares coordinate sort order, but the start positions of
        the three rows are 1, 4, 2.  Iterating the reader is expected to
        raise even with silent validation stringency.
        """
        names = ["Chromosome", "Start_Position", "End_Position"]
        scheme = NoRestrictionsScheme(names)

        version_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            scheme.version(),
        )
        sort_order_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.SortOrderKey,
            Coordinate(),
        )

        rows = (["A", "1", "1"], ["A", "4", "4"], ["A", "2", "2"])
        lines = [version_line, sort_order_line, "\t".join(names)]
        lines.extend("\t".join(row) for row in rows)

        fh, fn = tmp_file(lines=lines)
        fh.close()

        reader = MafReader.reader_from(
            path=fn,
            validation_stringency=ValidationStringency.Silent,
            scheme=scheme,
        )

        # The header itself is read without complaint.
        self.assertEqual(reader.scheme().version(), scheme.version())
        self.assertEqual(reader.header().version(), scheme.version())
        self.assertEqual(
            reader.header().sort_order().name(),
            Coordinate().name(),
        )

        # Exhausting the reader is what triggers the failure.
        with self.assertRaises(ValueError):
            for _ in reader:
                pass

        reader.close()
Exemplo n.º 15
0
    def __main__(cls, options):
        """The main method.

        Copies every record from ``options.input`` to the writer built by
        ``writer_from_reader``.  Progress is logged only when
        ``options.output`` is truthy: once every
        ``View.print_every_n_records`` records, plus a final total.

        The reader and the writer are closed even if reading or writing
        raises.  The writer is closed first, then the reader.
        """
        logger = Logger.get_logger(cls.__name__)

        reader = MafReader.reader_from(
            path=options.input,
            validation_stringency=options.validation_stringency,
            scheme=options.scheme)

        try:
            writer = writer_from_reader(reader=reader, options=options)

            try:
                n = 0
                for record in reader:
                    writer += record
                    n = n + 1
                    if options.output and n % View.print_every_n_records == 0:
                        logger.info("Processed %d records" % n)
                # Final count, unless the loop just logged it.
                if options.output and (
                        n == 0 or n % View.print_every_n_records != 0):
                    logger.info("Processed %d records" % n)
            finally:
                writer.close()
        finally:
            reader.close()
Exemplo n.º 16
0
    def __main__(cls, options):
        """The main method.

        Feeds every record from ``options.input`` into a ``MafSorter``
        configured with ``options.sort_order``, then writes the records out
        in the order the sorter yields them.  Progress is logged only when
        ``options.output`` is truthy: once every
        ``Sort.print_every_n_records`` records, plus a final total for each
        phase.

        The sorter, the writer and the reader are closed even if an error is
        raised during either phase.  They are closed in that order.
        """
        logger = Logger.get_logger(cls.__name__)

        reader = MafReader.reader_from(
            path=options.input,
            validation_stringency=options.validation_stringency,
            scheme=options.scheme)

        try:
            writer = writer_from_reader(reader=reader, options=options)

            try:
                sorter = MafSorter(max_objects_in_ram=100000,
                                   sort_order_name=options.sort_order,
                                   scheme=writer.header().scheme(),
                                   fasta_index=options.fasta_index)

                try:
                    # add the records to the sorter
                    n = 0
                    for record in reader:
                        sorter += record
                        n = n + 1
                        if options.output and \
                                n % Sort.print_every_n_records == 0:
                            logger.info("Sorted %d records" % n)
                    if options.output and (
                            n == 0 or n % Sort.print_every_n_records != 0):
                        logger.info("Sorted %d records" % n)

                    # read from the sorter
                    n = 0
                    for record in sorter:
                        writer += record
                        n = n + 1
                        if options.output and \
                                n % Sort.print_every_n_records == 0:
                            logger.info("Wrote %d records" % n)
                    if options.output and (
                            n == 0 or n % Sort.print_every_n_records != 0):
                        logger.info("Wrote %d records" % n)
                finally:
                    sorter.close()
            finally:
                writer.close()
        finally:
            reader.close()
Exemplo n.º 17
0
    def test_reader_from_with_scheme(self):
        """Read a three-row file from disk with an explicit scheme.

        Checks the reader's and header's versions, the header length, the
        number of records, and the values of the ``str1`` and ``float``
        columns.
        """
        scheme = TestMafReader.TestScheme()
        version_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.VersionKey,
            scheme.version(),
        )
        names = scheme.column_names()

        rows = (
            ["cell-1-1", "1.314", "cell-1-2"],
            ["cell-2-1", "2.314", "cell-2-2"],
            ["cell-3-1", "3.314", "cell-3-2"],
        )
        lines = [version_line, "\t".join(names)]
        lines.extend("\t".join(row) for row in rows)

        fh, fn = tmp_file(lines=lines)
        fh.close()

        reader = MafReader.reader_from(
            path=fn,
            validation_stringency=ValidationStringency.Silent,
            scheme=scheme,
        )
        records = [rec for rec in reader]

        # Header-level expectations.
        self.assertEqual(reader.scheme().version(), scheme.version())
        self.assertEqual(reader.header().version(), scheme.version())
        self.assertEqual(len(reader.header()), 1)

        # Record-level expectations.
        self.assertEqual(len(records), 3)
        self.assertListEqual(
            [rec["str1"].value for rec in records],
            ["cell-1-1", "cell-2-1", "cell-3-1"],
        )
        self.assertListEqual(
            [rec["float"].value for rec in records],
            [1.314, 2.314, 3.314],
        )

        reader.close()
    def do_work(self):
        """Main wrapper function for running public MAF filter.

        Reads ``options["input_maf"]`` with strict validation and writes the
        records that pass the filter to ``options["output_maf"]``.

        A record is considered only when its ``callers`` column holds at
        least ``options["min_callers"]`` entries and its ``Mutation_Status``
        is ``"Somatic"``.  A considered record is written when any of these
        hold:

        * it is a hotspot and every ``GDC_FILTER`` tag is one of ``gdc_pon``
          / ``common_in_gnomAD``;
        * it is a hotspot, the only other tag is ``NonExonic``, and
          ``self.is_splice(record)`` is true;
        * it is not a hotspot, its only possible tag is ``NonExonic``, and
          ``self.is_splice(record)`` is true;
        * it is not a hotspot and has no ``GDC_FILTER`` tags at all.

        Every record read increments ``self.metrics.input_records``.  Once
        the input is exhausted the metrics are printed to stdout as JSON.

        The reader and the writer are closed on every exit path, including
        when header setup, writer creation or record processing raises.
        The writer is closed first, then the reader.
        """
        self.logger.info("Processing input maf {0}...".format(
            self.options["input_maf"]))

        # Reader
        self.maf_reader = MafReader.reader_from(
            path=self.options["input_maf"],
            validation_stringency=ValidationStringency.Strict,
        )

        try:
            # Header
            self.setup_maf_header()

            # Writer
            self.maf_writer = MafWriter.from_path(
                path=self.options["output_maf"],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict,
            )

            try:
                self._scheme = self.maf_header.scheme()
                self._columns = get_columns_from_header(self.maf_header)
                self._colset = set(self._columns)

                # Counts
                processed = 0
                # GDC_FILTER tags that do not disqualify a hotspot record.
                hotspot_gdc_set = {"gdc_pon", "common_in_gnomAD"}
                # Tag that a splice record is allowed to carry.
                nonexonic_set = {"NonExonic"}

                for record in self.maf_reader:

                    if processed > 0 and processed % 1000 == 0:
                        self.logger.info(
                            "Processed {0} records...".format(processed))

                    callers = record["callers"].value
                    if (len(callers) >= self.options["min_callers"] and
                            record["Mutation_Status"].value.value
                            == "Somatic"):

                        self.metrics.add_sample_swap_metric(record)

                        gfset = set(record["GDC_FILTER"].value)

                        if self.is_hotspot(record):
                            other_filts = gfset - hotspot_gdc_set
                            if not other_filts:
                                self.write_record(record)
                            elif (other_filts <= nonexonic_set
                                  and self.is_splice(record)):
                                # Rescue splicing if NonExonic
                                self.write_record(record)

                        # Rescue splicing if NonExonic
                        elif (gfset <= nonexonic_set
                              and self.is_splice(record)):
                            self.write_record(record)

                        elif not gfset:
                            self.write_record(record)

                    processed += 1
                    self.metrics.input_records += 1

                self.logger.info("Processed {0} records.".format(processed))
                print(json.dumps(self.metrics.to_json(), indent=2,
                                 sort_keys=True))

            finally:
                # Runs even if processing failed part-way through.
                self.maf_writer.close()

        finally:
            # Runs even if header setup or writer creation failed.
            self.maf_reader.close()