def writer_from_reader(reader, options):
    """
    Builds a writer from the given reader and command line options.
    :param options: the command line options, which should have "output",
        "version", and "annotation" defined.
    :param reader: the reader from which the records will be obtained.
    :return: a ``MafWriter`` writing to ``options.output`` if set, otherwise
        to standard output.
    """
    out_header = MafHeader.from_reader(
        reader=reader,
        version=options.version,
        annotation=options.annotation,
        sort_order=options.sort_order if hasattr(options, 'sort_order') else None,
    )
    if options.output:
        writer = MafWriter.from_path(
            path=options.output,
            header=out_header,
            validation_stringency=options.validation_stringency)
    else:
        writer = MafWriter.from_fd(
            desc=sys.stdout,
            header=out_header,
            validation_stringency=options.validation_stringency)
    return writer
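# A minimal usage sketch for writer_from_reader (not part of the module):
# the Namespace fields and the literal version/annotation strings below are
# illustrative assumptions, not values taken from this codebase.
def _example_copy_maf(in_path, out_path):
    from argparse import Namespace
    opts = Namespace(
        output=out_path,  # set to None to write to sys.stdout instead
        version="gdc-1.0.0",  # assumed version string
        annotation="gdc-1.0.0-public",  # assumed annotation spec
        validation_stringency=ValidationStringency.Strict,
    )
    reader = MafReader.reader_from(
        path=in_path, validation_stringency=opts.validation_stringency)
    writer = writer_from_reader(reader, opts)
    try:
        for record in reader:
            writer += record  # MafWriter supports both += and write()
    finally:
        reader.close()
        writer.close()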
def do_work(self):
    """Main wrapper function for running public MAF filter"""
    self.logger.info("Processing input maf {0}...".format(
        self.options["input_maf"]))

    # Reader
    self.maf_reader = MafReader.reader_from(
        path=self.options['input_maf'],
        validation_stringency=ValidationStringency.Strict)

    # Header
    self.setup_maf_header()

    # Writer
    self.maf_writer = MafWriter.from_path(
        path=self.options['output_maf'],
        header=self.maf_header,
        validation_stringency=ValidationStringency.Strict)

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Counts
    processed = 0
    hotspot_gdc_set = set(['gdc_pon', 'common_in_exac'])

    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))
            callers = record['callers'].value
            if len(callers) >= self.options['min_callers'] and \
                    record['Mutation_Status'].value.value == 'Somatic':
                self.metrics.add_sample_swap_metric(record)
                gdc_filters = record['GDC_FILTER'].value
                gfset = set(gdc_filters)
                if self.is_hotspot(record):
                    if len(gfset - hotspot_gdc_set) == 0:
                        self.write_record(record)
                elif not gfset:
                    self.write_record(record)
            processed += 1
            self.metrics.input_records += 1

        self.logger.info("Processed {0} records.".format(processed))
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()
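# The pass/fail decision above is plain set arithmetic on the GDC_FILTER
# tags. A standalone sketch of the same rule with toy inputs (hypothetical
# helper name, no maflib types):
def _passes_public_filter(gdc_filters, is_hotspot):
    hotspot_gdc_set = {'gdc_pon', 'common_in_exac'}
    gfset = set(gdc_filters)
    if is_hotspot:
        # Hotspots tolerate only the two tags above.
        return len(gfset - hotspot_gdc_set) == 0
    # Non-hotspots must carry no filter tags at all.
    return not gfset

assert _passes_public_filter(['gdc_pon'], is_hotspot=True)
assert not _passes_public_filter(['gdc_pon'], is_hotspot=False)
assert _passes_public_filter([], is_hotspot=False)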
def test_empty_file(self):
    fd, path = tempfile.mkstemp()

    # No logging to stderr/stdout
    with captured_output() as (stdout, stderr):
        writer = MafWriter.from_path(
            path=path,
            header=MafHeader(),
            validation_stringency=ValidationStringency.Silent,
        )
        writer.close()
    self.assertEqual(read_lines(path), [])
    self.assertEqual(str(writer.header()), "")
    stdout = stdout.getvalue().rstrip('\r\n').split("\n")
    stderr = stderr.getvalue().rstrip('\r\n').split("\n")
    self.assertListEqual(stdout, [''])
    self.assertListEqual(stderr, [''])

    # Logging to stderr/stdout
    with captured_output() as (stdout, stderr):
        writer = MafWriter.from_path(
            path=path,
            header=MafHeader(),
            validation_stringency=ValidationStringency.Lenient,
        )
        writer.close()
    self.assertEqual(read_lines(path), [])
    self.assertEqual(str(writer.header()), "")
    stdout = stdout.getvalue().rstrip('\r\n').split("\n")
    stderr = stderr.getvalue().rstrip('\r\n').split("\n")
    self.assertListEqual(stdout, [''])
    self.assertListEqualAndIn(
        ['HEADER_MISSING_VERSION', 'HEADER_MISSING_ANNOTATION_SPEC'],
        stderr)

    # Exceptions
    with captured_output():
        with self.assertRaises(MafFormatException) as context:
            writer = MafWriter.from_path(
                path=path,
                header=MafHeader(),
                validation_stringency=ValidationStringency.Strict,
            )
    self.assertEqual(context.exception.tpe,
                     MafValidationErrorType.HEADER_MISSING_VERSION)
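# `captured_output` is a test helper defined outside this excerpt. A minimal
# sketch of the behaviour the assertions above rely on (temporarily swapping
# sys.stdout/sys.stderr for StringIO buffers); the name below is
# hypothetical so it does not shadow the real helper:
import sys
from contextlib import contextmanager
from io import StringIO

@contextmanager
def _captured_output_sketch():
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        yield new_out, new_err
    finally:
        # Always restore the real streams, even if the test body raises.
        sys.stdout, sys.stderr = old_out, old_err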
def test_record_validation_error(self):
    scheme = TestMafWriter.TestScheme()
    fd, path = tempfile.mkstemp()

    # Create the header
    header_lines = (MafHeader.scheme_header_lines(scheme) +
                    ["#key1 value1", "#key2 value2"] +
                    ["str1\tNone\tstr2"])
    header = MafHeader.from_lines(
        lines=header_lines,
        validation_stringency=ValidationStringency.Silent)

    # Create the record
    values = ["string2", "error", "string1"]
    record_line = MafRecord.ColumnSeparator.join(values)
    record = MafRecord.from_line(
        line=record_line,
        scheme=scheme,
        line_number=1,
        validation_stringency=ValidationStringency.Silent,
    )

    # Write the header, and the record twice
    with captured_output() as (stdout, stderr):
        writer = MafWriter.from_path(
            header=header,
            validation_stringency=ValidationStringency.Lenient,
            path=path,
        )
        writer += record
        writer.write(record)
        writer.close()
    stdout = stdout.getvalue().rstrip('\r\n').split("\n")
    stderr = stderr.getvalue().rstrip('\r\n').split("\n")
    self.assertListEqual(stdout, [''])

    # The errors that should be written to stderr
    errors = [
        "HEADER_UNSUPPORTED_VERSION",
        "HEADER_UNSUPPORTED_ANNOTATION_SPEC",
        "RECORD_COLUMN_WITH_NO_VALUE",
        "RECORD_COLUMN_WITH_NO_VALUE",
    ]
    self.assertListEqualAndIn(errors, stderr)

    # The second column should be None
    err_record_line = record_line.replace("error", "None")
    self.assertListEqual(read_lines(path),
                         header_lines + [err_record_line, err_record_line])
def test_close(self):
    fd, path = tempfile.mkstemp()
    lines = [
        TestMafWriter.__version_line,
        TestMafWriter.__annotation_line,
        "#key1 value1",
        "#key2 value2",
        TestMafWriter.__keys_line,
    ]
    header = MafHeader.from_lines(lines=lines)
    writer = MafWriter.from_path(header=header, path=path)
    writer._handle.write("LAST")  # Naughty
    writer.close()
    self.assertListEqual(read_lines(path), lines + ["LAST"])
    with self.assertRaises(ValueError):
        writer._handle.write("Oh no")
def test_add_records(self):
    scheme = TestMafWriter.TestScheme()
    fd, path = tempfile.mkstemp()
    header_lines = MafHeader.scheme_header_lines(scheme) + [
        "#key1 value1",
        "#key2 value2",
    ]
    header = MafHeader.from_lines(lines=header_lines)
    writer = MafWriter.from_path(header=header, path=path)
    values = ["string2", "3.14", "string1"]
    record_line = MafRecord.ColumnSeparator.join(values)
    record = MafRecord.from_line(line=record_line, scheme=scheme,
                                 line_number=1)
    writer += record
    writer.write(record)
    writer.close()
    self.assertListEqual(read_lines(path),
                         header_lines + [record_line, record_line])
def test_with_sorting(self):
    scheme = TestMafWriter.TestCoordinateScheme()
    fd, path = tempfile.mkstemp()

    # Create the header
    header_lines = (MafHeader.scheme_header_lines(scheme) +
                    ["#key1 value1", "#key2 value2"] +
                    ["%s%s %s" % (
                        MafHeader.HeaderLineStartSymbol,
                        MafHeader.SortOrderKey,
                        Coordinate().name(),
                    )] +
                    ["\t".join(scheme.column_names())])
    header = MafHeader.from_lines(
        lines=header_lines,
        validation_stringency=ValidationStringency.Silent)

    # Write the header and three records; with assume_sorted=False the
    # writer is responsible for emitting them in the header's sort order
    writer = MafWriter.from_path(
        header=header,
        validation_stringency=ValidationStringency.Lenient,
        path=path,
        assume_sorted=False,
    )
    writer += TestMafWriter.DummyRecord("chr1", 2, 2)
    writer += TestMafWriter.DummyRecord("chr1", 3, 3)
    writer += TestMafWriter.DummyRecord("chr1", 4, 4)
    writer.close()

    reader = MafReader.reader_from(path=path, scheme=scheme)
    header = reader.header()
    records = [rec for rec in reader]
    reader.close()

    self.assertEqual(header.sort_order().name(), Coordinate.name())
    self.assertListEqual([r["Start_Position"].value for r in records],
                         [2, 3, 4])
    self.assertListEqual([r["End_Position"].value for r in records],
                         [2, 3, 4])
def test_gz_support(self):
    fd, path = tempfile.mkstemp(suffix=".gz")
    lines = [
        TestMafWriter.__version_line,
        TestMafWriter.__annotation_line,
        "#key1 value1",
        "#key2 value2",
        TestMafWriter.__keys_line,
    ]
    with captured_output() as (stdout, stderr):
        header = MafHeader.from_lines(lines=lines)
        writer = MafWriter.from_path(header=header, path=path)
        writer.close()
    self.assertListEqual(read_lines(path), lines)
    self.assertEqual(
        str(writer.header()) + "\n" + TestMafWriter.__keys_line,
        "\n".join(lines),
    )
    stdout = stdout.getvalue().rstrip('\r\n').split("\n")
    stderr = stderr.getvalue().rstrip('\r\n').split("\n")
    self.assertListEqual(stdout, [''])
    self.assertListEqual(stderr, [''])
def do_work(self):
    """Main wrapper function for running protect MAF merging"""
    # Reader
    self.load_readers()

    # Header
    self.setup_maf_header()
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)

    # Sorter
    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=self.maf_header.scheme(),
        contigs=self.maf_header.contigs())

    # Merger
    self._merger = MafRecordMerger_1_0_0(self._scheme)

    # Overlap iterator
    o_iter = LocatableOverlapIterator(
        self.maf_readers,
        contigs=self.maf_header.contigs(),
        peekable_iterator_class=FilteringPeekableIterator)

    # ndp filter
    ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
    ndp_tag = ndp_filter.tags[0]

    # Counts
    processed = 0

    try:
        for record in o_iter:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} overlapping intervals...".format(
                        processed))
            result = OverlapSet(record, self.callers)
            for maf_record in self._merger.merge_records(result):
                if maf_record is not None:
                    # Recheck normal depth
                    gdc_filters = maf_record['GDC_FILTER'].value
                    has_tag = ndp_tag in gdc_filters
                    ndp = ndp_filter.filter(maf_record)
                    if has_tag != ndp:
                        if ndp:
                            gdc_filters.extend(ndp_filter.tags)
                        else:
                            gdc_filters = list(
                                filter(lambda x: x != ndp_filter.tags[0],
                                       gdc_filters))
                        maf_record["GDC_FILTER"] = get_builder(
                            "GDC_FILTER", self._scheme,
                            value=sorted(gdc_filters))

                    # Add to sorter
                    sorter += maf_record
            processed += 1

        self.logger.info(
            "Writing {0} sorted, merged records...".format(processed))

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)

        counter = 0
        for record in sorter:
            if counter > 0 and counter % 1000 == 0:
                self.logger.info(
                    "Wrote {0} sorted, merged records...".format(counter))
            self.maf_writer += record
            counter += 1

        self.logger.info(
            "Finished writing {0} sorted, merged records.".format(counter))
    finally:
        for reader in self.maf_readers:
            reader.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
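# The normal-depth recheck above only rewrites GDC_FILTER when the filter's
# verdict disagrees with the tag's presence (has_tag != ndp is an XOR). A
# standalone sketch of that toggle with plain lists (hypothetical helper,
# toy tag name, no maflib types):
def _sync_filter_tag(gdc_filters, tag, fails_filter):
    has_tag = tag in gdc_filters
    if has_tag != fails_filter:
        if fails_filter:
            gdc_filters = gdc_filters + [tag]
        else:
            gdc_filters = [t for t in gdc_filters if t != tag]
    return sorted(gdc_filters)

assert _sync_filter_tag([], 'ndp', fails_filter=True) == ['ndp']
assert _sync_filter_tag(['ndp'], 'ndp', fails_filter=False) == []
assert _sync_filter_tag(['ndp'], 'ndp', fails_filter=True) == ['ndp']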
def do_work(self):
    """Main wrapper function for running vcf2maf"""
    self.logger.info(
        "Processing input vcf {0}...".format(self.options["input_vcf"])
    )

    # Initialize the maf file
    self.setup_maf_header()

    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=self.maf_header.scheme(),
        fasta_index=self.options["reference_fasta_index"],
    )

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Initialize vcf reader
    vcf_object = pysam.VariantFile(self.options["input_vcf"])
    tumor_sample_id = self.options["tumor_vcf_id"]
    normal_sample_id = self.options["normal_vcf_id"]
    is_tumor_only = self.options["tumor_only"]

    try:
        # Validate samples
        tumor_idx = assert_sample_in_header(
            vcf_object, self.options["tumor_vcf_id"]
        )
        normal_idx = assert_sample_in_header(
            vcf_object, self.options["normal_vcf_id"], can_fail=is_tumor_only
        )

        # extract annotation from header
        ann_cols_format, vep_key = extract_annotation_from_header(
            vcf_object, vep_key="CSQ"
        )

        # Initialize annotators
        self.setup_annotators()

        # Initialize filters
        self.setup_filters()

        # Convert
        line = 0
        for vcf_record in vcf_object.fetch():
            line += 1
            if line % 1000 == 0:
                self.logger.info("Processed {0} records...".format(line))

            # Extract data
            data = self.extract(
                tumor_sample_id,
                normal_sample_id,
                tumor_idx,
                normal_idx,
                ann_cols_format,
                vep_key,
                vcf_record,
                is_tumor_only,
            )

            # Skip rare occasions where VEP doesn't provide IMPACT or
            # the consequence is '?'
            if (
                not data["selected_effect"]["IMPACT"]
                or data["selected_effect"]["One_Consequence"] == "?"
            ):
                self.logger.warn(
                    "Skipping record with unknown impact or consequence: "
                    "{0} - {1}".format(
                        data["selected_effect"]["IMPACT"],
                        data["selected_effect"]["One_Consequence"],
                    )
                )
                continue

            # Transform
            maf_record = self.transform(
                vcf_record, data, is_tumor_only, line_number=line
            )

            # Add to sorter
            sorter += maf_record

        # Write
        self.logger.info("Writing {0} sorted records...".format(line))
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )

        counter = 0
        for record in sorter:
            counter += 1
            if counter % 1000 == 0:
                self.logger.info("Wrote {0} records...".format(counter))
            self.maf_writer += record

        self.logger.info("Finished writing {0} records".format(counter))
    finally:
        vcf_object.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
        for anno in self.annotators:
            if self.annotators[anno]:
                self.annotators[anno].shutdown()
        self.logger.info("Finished")
def do_work(self):
    """Main wrapper function for running public MAF filter"""
    self.logger.info("Processing input maf {0}...".format(
        self.options["input_maf"]))

    # Reader
    self.maf_reader = MafReader.reader_from(
        path=self.options["input_maf"],
        validation_stringency=ValidationStringency.Strict,
    )

    # Header
    self.setup_maf_header()

    # Writer
    self.maf_writer = MafWriter.from_path(
        path=self.options["output_maf"],
        header=self.maf_header,
        validation_stringency=ValidationStringency.Strict,
    )

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Counts
    processed = 0
    hotspot_gdc_set = set(["gdc_pon", "common_in_gnomAD"])
    nonexonic_set = set(["NonExonic"])

    try:
        for record in self.maf_reader:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} records...".format(processed))
            callers = record["callers"].value
            if (len(callers) >= self.options["min_callers"] and
                    record["Mutation_Status"].value.value == "Somatic"):
                self.metrics.add_sample_swap_metric(record)
                gdc_filters = record["GDC_FILTER"].value
                gfset = set(gdc_filters)
                if self.is_hotspot(record):
                    other_filts = gfset - hotspot_gdc_set
                    if len(other_filts) == 0:
                        self.write_record(record)
                    elif (len(other_filts - nonexonic_set) == 0
                          and self.is_splice(record)):
                        # Rescue splicing if NonExonic
                        self.write_record(record)
                # Rescue splicing if NonExonic
                elif (len(gfset - nonexonic_set) == 0
                      and self.is_splice(record)):
                    self.write_record(record)
                elif not gfset:
                    self.write_record(record)
            processed += 1
            self.metrics.input_records += 1

        self.logger.info("Processed {0} records.".format(processed))
        print(json.dumps(self.metrics.to_json(), indent=2, sort_keys=True))
    finally:
        self.maf_reader.close()
        self.maf_writer.close()
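# This version extends the earlier public-filter rule with a splice-site
# rescue for records whose only disqualifying tag is NonExonic. A standalone
# sketch of the full decision (hypothetical helper; `is_splice` stands in
# for the method of the same name above; no maflib types):
def _passes_public_filter_v2(gdc_filters, is_hotspot, is_splice):
    hotspot_gdc_set = {'gdc_pon', 'common_in_gnomAD'}
    nonexonic_set = {'NonExonic'}
    gfset = set(gdc_filters)
    if is_hotspot:
        other_filts = gfset - hotspot_gdc_set
        if not other_filts:
            return True
        # Rescue splicing if NonExonic is the only remaining tag
        return not (other_filts - nonexonic_set) and is_splice
    if not (gfset - nonexonic_set) and is_splice:
        # Rescue splicing if NonExonic (non-hotspot path)
        return True
    return not gfset

assert _passes_public_filter_v2(['NonExonic'], False, True)
assert not _passes_public_filter_v2(['NonExonic'], False, False)
assert _passes_public_filter_v2(['gdc_pon', 'NonExonic'], True, True)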