def test_from_reader(self):
    """from_reader should adopt the reader's header values unless overridden."""
    protected = GdcV1_0_0_ProtectedScheme()
    header_lines = [
        TestMafHeader.__version_line,
        TestMafHeader.__annotation_line,
        TestMafHeader.__sort_order_line,
    ]
    reader = MafReader(
        lines=header_lines,
        validation_stringency=ValidationStringency.Silent,
        scheme=protected,
    )
    reader.close()

    # Without overrides the header mirrors the reader's scheme and sort order.
    header = MafHeader.from_reader(reader=reader)
    self.assertEqual(header.scheme().version(), protected.version())
    self.assertEqual(header.scheme().annotation_spec(),
                     protected.annotation_spec())
    self.assertEqual(header.sort_order().name(), Coordinate.name())

    # With overrides the supplied version/annotation/sort-order win.
    public = GdcV1_0_0_PublicScheme()
    header = MafHeader.from_reader(
        reader=reader,
        version=public.version(),
        annotation=public.annotation_spec(),
        sort_order=sort_order.Unsorted().name(),
    )
    self.assertEqual(header.scheme().version(), public.version())
    self.assertEqual(header.scheme().annotation_spec(),
                     public.annotation_spec())
    self.assertEqual(header.sort_order().name(), sort_order.Unsorted().name())
def test_different_column_names_but_same_named_scheme(self):
    """Reversed column order is only flagged when an explicit scheme is given."""
    lines = [
        TestMafReader.Version,
        TestMafReader.AnnotationSpec,
        "\t".join(reversed(TestMafReader.Names)),
    ]

    # With a scheme, the mismatching column names are reported.
    reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
    self.assertIsNone(next(reader, None))
    expected = [
        MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
        MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
        MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
        MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
    ]
    self.assertListEqual([err.tpe for err in reader.validation_errors],
                         expected)

    # Without a scheme, only the header-level problems remain.
    reader = MafReader(lines=lines, scheme=None)
    self.assertIsNone(next(reader, None))
    self.assertListEqual(
        [err.tpe for err in reader.validation_errors],
        [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
        ],
    )
def test_use_scheme_from_header_basic(self):
    """A basic-scheme version line should select the basic scheme via the header."""
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        GdcV1_0_0_BasicScheme.version(),
    )
    lines = [version_line, "\t".join(GdcV1_0_0_BasicScheme().column_names())]
    reader = MafReader(lines=lines, scheme=None)
    self.assertIsNone(next(reader, None))
    # A matching version and column set should validate cleanly.
    self.assertEqual(len(reader.validation_errors), 0)
    self.assertEqual(reader.header().scheme().version(),
                     GdcV1_0_0_BasicScheme().version())
def do_work(self):
    """Main wrapper function for running public MAF filter"""
    self.logger.info(
        "Processing input maf {0}...".format(self.options["input_maf"]))

    # Strictly-validated reader over the input MAF.
    self.maf_reader = MafReader.reader_from(
        path=self.options["input_maf"],
        validation_stringency=ValidationStringency.Strict)

    # Build the output header before opening the writer.
    self.setup_maf_header()

    self.maf_writer = MafWriter.from_path(
        path=self.options["output_maf"],
        header=self.maf_header,
        validation_stringency=ValidationStringency.Strict)

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    processed = 0
    # GDC filters that do not disqualify a hotspot record.
    hotspot_gdc_set = {"gdc_pon", "common_in_exac"}
    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))
            callers = record["callers"].value
            if (len(callers) >= self.options["min_callers"]
                    and record["Mutation_Status"].value.value == "Somatic"):
                self.metrics.add_sample_swap_metric(record)
                gfset = set(record["GDC_FILTER"].value)
                if self.is_hotspot(record):
                    # Hotspots tolerate pon/ExAC-only GDC filters.
                    if not (gfset - hotspot_gdc_set):
                        self.write_record(record)
                elif not gfset:
                    # Non-hotspots must be completely unfiltered.
                    self.write_record(record)
            processed += 1
            self.metrics.input_records += 1
        self.logger.info("Processed {0} records.".format(processed))
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()
def test_iter(self):
    """Iterating the reader should yield exactly the single data row."""
    lines = [
        TestMafReader.Version,
        TestMafReader.AnnotationSpec,
        "\t".join(reversed(TestMafReader.Names)),
        "\t".join(["string1", "3.14", "string2"]),
    ]
    reader = MafReader(lines=lines, scheme=None)
    records = list(reader)
    self.assertEqual(len(records), 1)
def test_use_default_scheme(self):
    """With no version line, the reader falls back to the no-restrictions scheme."""
    lines = [
        TestMafReader.AnnotationSpec,
        "\t".join(GdcV1_0_0_BasicScheme().column_names()),
    ]
    reader = MafReader(lines=lines, scheme=None)
    self.assertIsNone(next(reader, None))
    observed = [err.tpe for err in reader.validation_errors]
    self.assertEqual(len(observed), 2)
    self.assertListEqual(
        observed,
        [
            MafValidationErrorType.HEADER_MISSING_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
        ],
    )
    # The header itself carries no scheme, so the reader uses the
    # unrestricted default scheme instead.
    self.assertIsNone(reader.header().scheme())
    self.assertEqual(
        reader.scheme().version(),
        NoRestrictionsScheme(column_names=list()).version(),
    )
def test_use_different_scheme(self):
    """Header declares the protected scheme while columns match the basic scheme."""
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        GdcV1_0_0_ProtectedScheme.version(),
    )
    annotation_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.AnnotationSpecKey,
        GdcV1_0_0_ProtectedScheme.annotation_spec(),
    )
    lines = [
        version_line,
        annotation_line,
        "\t".join(GdcV1_0_0_BasicScheme().column_names()),
    ]
    reader = MafReader(lines=lines, scheme=None)
    self.assertIsNone(next(reader, None))

    # Only the column-count mismatch should be reported.
    errors = [err.tpe for err in reader.validation_errors]
    self.assertEqual(len(errors), 1)
    self.assertListEqual(
        errors,
        [MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES],
    )

    self.assertEqual(reader.header().scheme().version(),
                     GdcV1_0_0_BasicScheme.version())
    self.assertEqual(reader.scheme().version(),
                     GdcV1_0_0_ProtectedScheme().version())
    self.assertEqual(
        reader.scheme().annotation_spec(),
        GdcV1_0_0_ProtectedScheme.annotation_spec(),
    )
    self.assertEqual(
        reader.header().scheme().annotation_spec(),
        GdcV1_0_0_ProtectedScheme.annotation_spec(),
    )
def test_missing_column_names(self):
    """A header without a column line should report HEADER_MISSING_COLUMN_NAMES."""
    lines = [TestMafReader.Version, TestMafReader.AnnotationSpec]
    # With a scheme supplied we expect a few errors, ending with the
    # missing-column-names error.
    reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
    self.assertIsNone(next(reader, None))
    self.assertListEqual(
        [err.tpe for err in reader.validation_errors],
        [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
            MafValidationErrorType.HEADER_MISSING_COLUMN_NAMES,
        ],
    )
def test_different_scheme_in_header(self):
    """The header's version conflicts with the explicitly supplied scheme."""
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        GdcV1_0_0_BasicScheme.version(),
    )
    lines = [version_line, "\t".join(TestMafReader.Names)]

    # With an explicit scheme the conflict is a HEADER_MISMATCH_SCHEME error.
    reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
    self.assertIsNone(next(reader, None))
    self.assertListEqual(
        [err.tpe for err in reader.validation_errors],
        [MafValidationErrorType.HEADER_MISMATCH_SCHEME],
    )

    # Without a scheme, only the column-count mismatch is flagged.
    reader = MafReader(lines=lines, scheme=None)
    self.assertIsNone(next(reader, None))
    self.assertListEqual(
        [err.tpe for err in reader.validation_errors],
        [MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES],
    )
def load_readers(self):
    """ Loads the array of MafReaders and sets the callers list. """
    callers = ("mutect2", "muse", "vardict", "varscan2", "somaticsniper",
               "pindel")
    for caller in callers:
        # Skip callers for which no MAF path was provided.
        if not self.options[caller]:
            continue
        self.logger.info("{0} MAF {1}".format(caller, self.options[caller]))
        self.maf_readers.append(
            MafReader.reader_from(
                path=self.options[caller],
                validation_stringency=ValidationStringency.Strict))
        self.callers.append(caller)
def test_record_errors(self):
    """A row with an unparseable float should surface record-level errors."""
    lines = [
        TestMafReader.Version,
        TestMafReader.AnnotationSpec,
        "\t".join(TestMafReader.Names),
        "\t".join(["string1", "string-float", "string2"]),
    ]
    reader = MafReader(lines=lines, scheme=TestMafReader.Scheme)
    # The bad record is still yielded; errors are collected on the reader.
    self.assertEqual(len(list(reader)), 1)
    self.assertListEqual(
        [err.tpe for err in reader.validation_errors],
        [
            MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
            MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
            MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
        ],
    )
def __main__(cls, options):
    """The main method.

    Validates each input MAF, collecting up to ``options.max_errors``
    validation errors, and writes a report to ``options.output`` (or
    stdout when no output path is given).
    """
    logger = Logger.get_logger(cls.__name__)

    # Report destination: a real file when requested, else stdout.
    handle = sys.stdout if options.output is None else open(options.output, "w")
    try:
        errors = ValidationErrors(options.max_errors or sys.maxsize)
        for path in options.input:
            logger.info("Examining %s", path)
            # Read silently so we can gather as many errors as possible.
            reader = MafReader.reader_from(
                path=path,
                validation_stringency=ValidationStringency.Silent,
                scheme=options.scheme)
            try:
                if not cls.__process_errors(options, reader, logger, handle,
                                            errors):
                    n = 0
                    for _ in reader:
                        if cls.__process_errors(options, reader, logger,
                                                handle, errors):
                            break
                        n = n + 1
                        if n % Validate.print_every_n_records == 0:
                            # Lazy %-args: no formatting when INFO is disabled.
                            logger.info("Processed %d records", n)
                    if n == 0 or n % Validate.print_every_n_records != 0:
                        logger.info("Processed %d records", n)
            finally:
                # Close the reader even if error processing raises.
                reader.close()
        cls.__print_report(options, errors, handle)
    finally:
        # Only close handles we opened ourselves, never sys.stdout.
        if options.output:
            handle.close()
def test_with_sorting(self):
    """Writer with assume_sorted=False should emit records in coordinate order."""
    import os

    scheme = TestMafWriter.TestCoordinateScheme()
    fd, path = tempfile.mkstemp()
    # mkstemp returns an open OS-level descriptor; close it so it is not
    # leaked (the writer/reader below open the path themselves).
    os.close(fd)

    # Build a header carrying the coordinate sort order.
    sort_order_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.SortOrderKey,
        Coordinate().name(),
    )
    header_lines = (
        MafHeader.scheme_header_lines(scheme)
        + ["#key1 value1", "#key2 value2"]
        + [sort_order_line]
        + ["\t".join(scheme.column_names())]
    )
    header = MafHeader.from_lines(
        lines=header_lines,
        validation_stringency=ValidationStringency.Silent)

    # Write three records through the sorting writer.
    writer = MafWriter.from_path(
        header=header,
        validation_stringency=ValidationStringency.Lenient,
        path=path,
        assume_sorted=False,
    )
    writer += TestMafWriter.DummyRecord("chr1", 2, 2)
    writer += TestMafWriter.DummyRecord("chr1", 3, 3)
    writer += TestMafWriter.DummyRecord("chr1", 4, 4)
    writer.close()

    # Read back and confirm both the sort-order header and the record order.
    reader = MafReader.reader_from(path=path, scheme=scheme)
    header = reader.header()
    records = [rec for rec in reader]
    reader.close()
    self.assertEqual(header.sort_order().name(), Coordinate.name())
    self.assertListEqual([r["Start_Position"].value for r in records],
                         [2, 3, 4])
    self.assertListEqual([r["End_Position"].value for r in records],
                         [2, 3, 4])
def test_reader_out_of_order(self):
    """Out-of-coordinate-order records should raise during iteration."""
    column_names = ["Chromosome", "Start_Position", "End_Position"]
    scheme = NoRestrictionsScheme(column_names)
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        scheme.version(),
    )
    sort_order_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.SortOrderKey,
        Coordinate(),
    )
    lines = [
        version_line,
        sort_order_line,
        "\t".join(column_names),
        "\t".join(["A", "1", "1"]),
        "\t".join(["A", "4", "4"]),
        "\t".join(["A", "2", "2"]),  # out of order vs. the previous row
    ]
    fh, fn = tmp_file(lines=lines)
    fh.close()
    reader = MafReader.reader_from(
        path=fn,
        validation_stringency=ValidationStringency.Silent,
        scheme=scheme)
    self.assertEqual(reader.scheme().version(), scheme.version())
    self.assertEqual(reader.header().version(), scheme.version())
    self.assertEqual(reader.header().sort_order().name(),
                     Coordinate().name())
    # Consuming the records must fail once the out-of-order row is hit.
    with self.assertRaises(ValueError):
        list(reader)
    reader.close()
def __main__(cls, options):
    """The main method.

    Streams every record from the input MAF to a writer derived from the
    reader, logging progress only when writing to a file.
    """
    logger = Logger.get_logger(cls.__name__)
    reader = MafReader.reader_from(
        path=options.input,
        validation_stringency=options.validation_stringency,
        scheme=options.scheme)
    writer = writer_from_reader(reader=reader, options=options)
    try:
        n = 0
        for record in reader:
            writer += record
            n = n + 1
            if options.output and n % View.print_every_n_records == 0:
                # Lazy %-args: no string formatting when INFO is disabled.
                logger.info("Processed %d records", n)
        # Final count, unless it was just logged by the loop above.
        if options.output and (n == 0 or n % View.print_every_n_records != 0):
            logger.info("Processed %d records", n)
    finally:
        # Close both ends even if iteration or writing raises.
        reader.close()
        writer.close()
def __main__(cls, options):
    """The main method.

    Reads all records from the input MAF, sorts them with a bounded-memory
    sorter, and writes them out in the requested sort order.
    """
    logger = Logger.get_logger(cls.__name__)
    reader = MafReader.reader_from(
        path=options.input,
        validation_stringency=options.validation_stringency,
        scheme=options.scheme)
    writer = writer_from_reader(reader=reader, options=options)
    sorter = MafSorter(max_objects_in_ram=100000,
                       sort_order_name=options.sort_order,
                       scheme=writer.header().scheme(),
                       fasta_index=options.fasta_index)
    try:
        # Feed every input record into the sorter.
        n = 0
        for record in reader:
            sorter += record
            n = n + 1
            if options.output and n % Sort.print_every_n_records == 0:
                # Lazy %-args: no string formatting when INFO is disabled.
                logger.info("Sorted %d records", n)
        if options.output and (n == 0
                               or n % Sort.print_every_n_records != 0):
            logger.info("Sorted %d records", n)

        # Drain the sorter into the writer in sorted order.
        n = 0
        for record in sorter:
            writer += record
            n = n + 1
            if options.output and n % Sort.print_every_n_records == 0:
                logger.info("Wrote %d records", n)
        if options.output and (n == 0
                               or n % Sort.print_every_n_records != 0):
            logger.info("Wrote %d records", n)
    finally:
        # Release all resources even if sorting or writing raises.
        sorter.close()
        reader.close()
        writer.close()
def test_reader_from_with_scheme(self):
    """reader_from should parse typed columns according to the supplied scheme."""
    scheme = TestMafReader.TestScheme()
    version_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.VersionKey,
        scheme.version(),
    )
    lines = [
        version_line,
        "\t".join(scheme.column_names()),
        "\t".join(["cell-1-1", "1.314", "cell-1-2"]),
        "\t".join(["cell-2-1", "2.314", "cell-2-2"]),
        "\t".join(["cell-3-1", "3.314", "cell-3-2"]),
    ]
    fh, fn = tmp_file(lines=lines)
    fh.close()
    reader = MafReader.reader_from(
        path=fn,
        validation_stringency=ValidationStringency.Silent,
        scheme=scheme)
    records = list(reader)

    self.assertEqual(reader.scheme().version(), scheme.version())
    self.assertEqual(reader.header().version(), scheme.version())
    self.assertEqual(len(reader.header()), 1)
    self.assertEqual(len(records), 3)
    # str1 is read verbatim; float is coerced to a Python float.
    self.assertListEqual([rec["str1"].value for rec in records],
                         ["cell-1-1", "cell-2-1", "cell-3-1"])
    self.assertListEqual([rec["float"].value for rec in records],
                         [1.314, 2.314, 3.314])
    reader.close()
def do_work(self):
    """Main wrapper function for running public MAF filter"""
    self.logger.info(
        "Processing input maf {0}...".format(self.options["input_maf"]))

    # Strictly-validated reader over the input MAF.
    self.maf_reader = MafReader.reader_from(
        path=self.options["input_maf"],
        validation_stringency=ValidationStringency.Strict,
    )

    # Build the output header before opening the writer.
    self.setup_maf_header()

    self.maf_writer = MafWriter.from_path(
        path=self.options["output_maf"],
        header=self.maf_header,
        validation_stringency=ValidationStringency.Strict,
    )

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    processed = 0
    # GDC filters tolerated on hotspot records.
    hotspot_gdc_set = {"gdc_pon", "common_in_gnomAD"}
    # NonExonic alone may be rescued when the record is a splice hit.
    nonexonic_set = {"NonExonic"}
    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))
            callers = record["callers"].value
            if (len(callers) >= self.options["min_callers"]
                    and record["Mutation_Status"].value.value == "Somatic"):
                self.metrics.add_sample_swap_metric(record)
                gfset = set(record["GDC_FILTER"].value)
                if self.is_hotspot(record):
                    other_filts = gfset - hotspot_gdc_set
                    if not other_filts:
                        self.write_record(record)
                    elif (not (other_filts - nonexonic_set)
                          and self.is_splice(record)):
                        # Rescue splicing if NonExonic
                        self.write_record(record)
                elif (not (gfset - nonexonic_set)
                      and self.is_splice(record)):
                    # Rescue splicing if NonExonic
                    self.write_record(record)
                elif not gfset:
                    self.write_record(record)
            processed += 1
            self.metrics.input_records += 1
        self.logger.info("Processed {0} records.".format(processed))
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()