def setup_maf_header(self):
    """
    Sets up the maf header.
    """
    # Reader header
    _hdr = MafHeader.from_reader(reader=self.maf_reader)
    if not self.options["reference_fasta_index"]:
        self.maf_header = MafHeader.from_defaults(
            version=self.options["version"],
            annotation=self.options["annotation"],
            sort_order=BarcodesAndCoordinate(),
            contigs=_hdr.contigs(),
        )
    else:
        self.maf_header = MafHeader.from_defaults(
            version=self.options["version"],
            annotation=self.options["annotation"],
            sort_order=BarcodesAndCoordinate(),
            fasta_index=self.options["reference_fasta_index"],
        )
    self.maf_header.validation_stringency = ValidationStringency.Strict

    header_date = BaseRunner.get_header_date()
    self.maf_header[header_date.key] = header_date

    try:
        nkey = _hdr["normal.aliquot"]
        self.maf_header["normal.aliquot"] = nkey
    except KeyError as e:
        # A missing normal aliquot is only acceptable for tumor-only MAFs
        if not self.options["tumor_only"]:
            raise e

    tkey = _hdr["tumor.aliquot"]
    self.maf_header["tumor.aliquot"] = tkey
def __add_arguments__(cls, subparser):
    """
    Add arguments to a subparser.
    """
    sort_orders = [c.name() for c in SortOrder.all()]
    subparser.add_argument(
        '-i', '--input',
        dest='input',
        required=True,
        help='A MAF file.')
    subparser.add_argument(
        '-o', '--output',
        default=None,
        help="The output file, otherwise output will be to standard output.")
    subparser.add_argument(
        '-s', '--sort-order',
        default=BarcodesAndCoordinate.name(),
        choices=sort_orders,
        help="The sort order to choose. Choices: %s" % ", ".join(sort_orders))
    subparser.add_argument(
        '-f', '--fasta-index',
        default=None,
        help="Use the FASTA index (fai) to order genomic coordinates.")
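A minimal sketch of how the parsed arguments above might feed a MafSorter, assuming `options` is the argparse.Namespace produced by this subparser; the `max_objects_in_ram` value is chosen here only for illustration and is not taken from the original command. The MafSorter keyword arguments mirror the ones used in the sorter tests later in this listing.

# Hypothetical glue code: wire the parsed CLI options into a MafSorter.
sorter = MafSorter(
    sort_order_name=options.sort_order,   # e.g. BarcodesAndCoordinate.name()
    fasta_index=options.fasta_index,      # optional .fai defining contig order
    max_objects_in_ram=100000,            # illustrative in-memory spill threshold
)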
def __init__(self, iters, fasta_index=None, contigs=None, by_barcodes=True,
             peekable_iterator_class=PeekableIterator):
    """
    :param iters: the list of iterators.
    :param fasta_index: the path to the FASTA index for defining ordering
        across chromosomes.
    :param contigs: the list of contigs to use for sorting instead of
        parsing from FASTA index.
    :param by_barcodes: True to require the same tumor and matched normal
        barcodes for returned locatables, False otherwise.
    :param peekable_iterator_class: PeekableIterator class to use when
        traversing individual MAFs. This allows developers to add in custom
        filters and custom handling of MAFs.
    """
    self._by_barcodes = by_barcodes
    if self._by_barcodes:
        self._sort_order = BarcodesAndCoordinate(fasta_index=fasta_index,
                                                 contigs=contigs)
        self._overlap_f = self.__overlaps_with_barcode
    else:
        self._sort_order = Coordinate(fasta_index=fasta_index)
        self._overlap_f = self.__overlaps

    # Trust, but verify
    _iters = [_SortOrderEnforcingIterator(_iter, self._sort_order)
              for _iter in iters]
    self._iters = [peekable_iterator_class(_iter) for _iter in _iters]
    self._sort_key = self._sort_order.sort_key()
def setup_maf_header(self):
    """
    Sets up the maf header.
    """
    self.maf_header = MafHeader.from_defaults(
        version=self.options["version"],
        annotation=self.options["annotation"],
        sort_order=BarcodesAndCoordinate(),
        fasta_index=self.options["reference_fasta_index"],
    )

    header_date = BaseRunner.get_header_date()
    self.maf_header[header_date.key] = header_date

    if not self.options["tumor_only"]:
        normal_aliquot = MafHeaderRecord(
            key="normal.aliquot",
            value=self.options["normal_aliquot_uuid"]
            if not self.options["tumor_only"]
            else "",
        )
        self.maf_header[normal_aliquot.key] = normal_aliquot

    tumor_aliquot = MafHeaderRecord(
        key="tumor.aliquot", value=self.options["tumor_aliquot_uuid"]
    )
    self.maf_header[tumor_aliquot.key] = tumor_aliquot
def test_sorter_with_sort_order_args(self):
    lines = [
        "chr1\t248956422\t112\t70\t71",
        "chr2\t242193529\t252513167\t70\t71",
        "chr3\t198295559\t498166716\t70\t71",
        "chr4\t190214555\t699295181\t70\t71",
        "chr5\t181538259\t892227221\t70\t71",
        "chr6\t170805979\t1076358996\t70\t71",
        "chr7\t159345973\t1249605173\t70\t71",
        "chr8\t145138636\t1411227630\t70\t71",
        "chr9\t138394717\t1558439788\t70\t71",
        "chr10\t133797422\t1698811686\t70\t71",
    ]
    fd, fn = tmp_file(lines=lines)
    sorter = MafSorter(
        sort_order_name=BarcodesAndCoordinate.name(),
        max_objects_in_ram=100,
        fasta_index=fn,
    )
    self.__test_sorter(sorter=sorter, chromosome="chr5")
    with self.assertRaises(ValueError):
        self.__test_sorter(sorter=sorter, chromosome="1")
    fd.close()
    os.remove(fn)
def test_end_to_end(self):
    lines, header, records = self.read_test_maf()

    # reverse the lines
    input_lines = header + list(reversed(records))

    subcommand_args = [
        "--version", GdcV1_0_0_PublicScheme.version(),
        "--annotation", GdcV1_0_0_PublicScheme.annotation_spec()
    ]

    out_lines, stdout, stderr = run_main(subcommand="sort",
                                         lines=input_lines,
                                         subcommand_args=subcommand_args)

    # Check that we have the same # of records
    out_records = [line for line in out_lines
                   if not line.startswith("#")
                   and not line.startswith("Hugo_Symbol")]
    self.assertEqual(len(out_records), len(records))

    # Check that we added the sort pragma
    sortOrderLine = "%s%s %s" % (MafHeader.HeaderLineStartSymbol,
                                 MafHeader.SortOrderKey,
                                 BarcodesAndCoordinate.name())
    self.assertTrue(sortOrderLine in out_lines)
    self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
def test_sorter_with_scheme(self):
    scheme = DummyScheme()
    sorter = MafSorter(
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=scheme,
        max_objects_in_ram=100,
    )
    self.__test_sorter(sorter=sorter, with_scheme=True)
def __init__(
    self,
    iters: List[MafReader],
    fasta_index: Optional[str] = None,
    contigs: Optional[List[str]] = None,
    by_barcodes: bool = True,
    peekable_iterator_class: Type[PeekableIterator] = PeekableIterator,
):
    """
    :param iters: the list of iterators.
    :param fasta_index: the path to the FASTA index for defining ordering
        across chromosomes.
    :param contigs: the list of contigs to use for sorting instead of
        parsing from FASTA index.
    :param by_barcodes: True to require the same tumor and matched normal
        barcodes for returned locatables, False otherwise.
    :param peekable_iterator_class: PeekableIterator class to use when
        traversing individual MAFs. This allows developers to add in custom
        filters and custom handling of MAFs.
    """
    self._overlap_f: Union[
        Callable[[_BarcodesAndCoordinateKey, _BarcodesAndCoordinateKey], bool],
        Callable[[Locatable, Locatable], bool],
    ]
    if not by_barcodes:
        _sort_order = Coordinate(fasta_index=fasta_index)
        self._overlap_f = self.__overlaps
    else:
        _sort_order = BarcodesAndCoordinate(fasta_index=fasta_index,
                                            contigs=contigs)
        self._overlap_f = self.__overlaps_with_barcode
    self._sort_order: Coordinate = _sort_order
    self._by_barcodes: bool = by_barcodes

    # Trust, but verify
    _iters = [
        _SortOrderEnforcingIterator(_iter, self._sort_order) for _iter in iters
    ]
    self._iters: List[PeekableIterator] = [
        peekable_iterator_class(_iter) for _iter in _iters
    ]
    self._sort_key: TSortKey = self._sort_order.sort_key()
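For context, a minimal usage sketch of this constructor, mirroring how the merging runner later in this listing drives it. The `readers` list (already-opened MafReader objects), the `genome.fa.fai` path, and the `process` call are placeholders; the exact shape of each yielded group is an assumption inferred from the `OverlapSet(record, self.callers)` call in the merging example below.

# Hypothetical usage: group overlapping records across several MAF readers.
o_iter = LocatableOverlapIterator(
    readers,                      # list of opened MafReader objects (placeholder)
    fasta_index="genome.fa.fai",  # placeholder .fai defining contig order
    by_barcodes=True,             # only group records sharing tumor/normal barcodes
)
for overlapping in o_iter:
    # `overlapping` is the next group of coordinate-overlapping records,
    # apparently one entry per input reader (see the merging example below).
    process(overlapping)          # placeholder for downstream merging logic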
def test_less_than_max_in_memory(self):
    max_objects_in_ram = 100
    num_records = max_objects_in_ram - 1
    sorter = Sorter(max_objects_in_ram,
                    self.codec(),
                    BarcodesAndCoordinate().sort_key(),
                    always_spill=False)
    # add them in reverse order
    for i in range(num_records):
        record = DummyRecord("A", "B", "C", 1, num_records - i - 1)
        sorter += record
    records = [r for r in sorter]
    sorter.close()
    self.assertEqual(len(records), num_records)
    for i in range(num_records):
        record = records[i]
        self.assertEqual(record.value("End_Position"), i)
def test_with_fasta_index(self):
    # change the order of chromosomes!
    fasta_index_lines = [
        "chr13\t114364328\t2106716512\t70\t71",
        "chr1\t248956422\t112\t70\t71"
    ]
    fd, fn = tmp_file(lines=fasta_index_lines)

    lines, header, records = self.read_test_maf()

    subcommand_args = [
        "--version", GdcV1_0_0_PublicScheme.version(),
        "--annotation", GdcV1_0_0_PublicScheme.annotation_spec(),
        "--fasta-index", fn
    ]

    out_lines, stdout, stderr = run_main(subcommand="sort",
                                         lines=lines,
                                         subcommand_args=subcommand_args)

    # Check that we have the same # of records
    out_records = [line for line in out_lines
                   if not line.startswith("#")
                   and not line.startswith("Hugo_Symbol")]
    self.assertEqual(len(out_records), len(records))

    # Check that we added the sort pragma
    sortOrderLine = "%s%s %s" % (MafHeader.HeaderLineStartSymbol,
                                 MafHeader.SortOrderKey,
                                 BarcodesAndCoordinate.name())
    self.assertTrue(sortOrderLine in out_lines)

    scheme = find_scheme(
        version=GdcV1_0_0_PublicScheme.version(),
        annotation=GdcV1_0_0_PublicScheme.annotation_spec())

    # we should see chr13 before chr1
    self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
    found_chr1 = False
    for line in out_lines:
        if line.startswith(MafHeader.HeaderLineStartSymbol):
            continue
        record = MafRecord.from_line(line=line, scheme=scheme)
        self.assertFalse(record["Chromosome"] == "chr13" and found_chr1)
        found_chr1 = record["Chromosome"] == "chr1"

    fd.close()
    os.remove(fn)
def setup_maf_header(self):
    """
    Sets up the maf header.
    """
    # Reader header
    _hdr = MafHeader.from_reader(reader=self.maf_readers[0])

    self.maf_header = MafHeader.from_defaults(
        version=self.options['version'],
        annotation=self.options['annotation'],
        sort_order=BarcodesAndCoordinate(),
        contigs=_hdr.contigs())
    self.maf_header.validation_stringency = ValidationStringency.Strict

    header_date = BaseRunner.get_header_date()
    self.maf_header[header_date.key] = header_date

    nkey = _hdr["normal.aliquot"]
    self.maf_header["normal.aliquot"] = nkey
    tkey = _hdr["tumor.aliquot"]
    self.maf_header["tumor.aliquot"] = tkey
def do_work(self):
    """Main wrapper function for running protect MAF merging"""
    # Reader
    self.load_readers()

    # Header
    self.setup_maf_header()
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)

    # Sorter
    sorter = MafSorter(max_objects_in_ram=100000,
                       sort_order_name=BarcodesAndCoordinate.name(),
                       scheme=self.maf_header.scheme(),
                       contigs=self.maf_header.contigs())

    # Merger
    self._merger = MafRecordMerger_1_0_0(self._scheme)

    # Overlap iterator
    o_iter = LocatableOverlapIterator(
        self.maf_readers,
        contigs=self.maf_header.contigs(),
        peekable_iterator_class=FilteringPeekableIterator)

    # ndp filter
    ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
    ndp_tag = ndp_filter.tags[0]

    # Counts
    processed = 0

    try:
        for record in o_iter:
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} overlapping intervals...".format(processed))

            result = OverlapSet(record, self.callers)
            for maf_record in self._merger.merge_records(result):
                if maf_record is not None:
                    # Recheck normal depth
                    gdc_filters = maf_record['GDC_FILTER'].value
                    has_tag = ndp_tag in gdc_filters
                    ndp = ndp_filter.filter(maf_record)
                    if has_tag != ndp:
                        if ndp:
                            gdc_filters.extend(ndp_filter.tags)
                        else:
                            gdc_filters = list(
                                filter(lambda x: x != ndp_filter.tags[0],
                                       gdc_filters))
                        maf_record["GDC_FILTER"] = get_builder(
                            "GDC_FILTER", self._scheme,
                            value=sorted(gdc_filters))

                    # Add to sorter
                    sorter += maf_record

            processed += 1

        self.logger.info(
            "Writing {0} sorted, merged records...".format(processed))

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)

        counter = 0
        for record in sorter:
            if counter > 0 and counter % 1000 == 0:
                self.logger.info(
                    "Wrote {0} sorted, merged records...".format(counter))
            self.maf_writer += record
            counter += 1

        self.logger.info(
            "Finished writing {0} sorted, merged records.".format(counter))

    finally:
        for reader in self.maf_readers:
            reader.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
def do_work(self):
    """Main wrapper function for running vcf2maf"""
    self.logger.info(
        "Processing input vcf {0}...".format(self.options["input_vcf"])
    )

    # Initialize the maf file
    self.setup_maf_header()
    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=self.maf_header.scheme(),
        fasta_index=self.options["reference_fasta_index"],
    )
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Initialize vcf reader
    vcf_object = pysam.VariantFile(self.options["input_vcf"])
    tumor_sample_id = self.options["tumor_vcf_id"]
    normal_sample_id = self.options["normal_vcf_id"]
    is_tumor_only = self.options["tumor_only"]

    try:
        # Validate samples
        tumor_idx = assert_sample_in_header(
            vcf_object, self.options["tumor_vcf_id"]
        )
        normal_idx = assert_sample_in_header(
            vcf_object, self.options["normal_vcf_id"], can_fail=is_tumor_only
        )

        # extract annotation from header
        ann_cols_format, vep_key = extract_annotation_from_header(
            vcf_object, vep_key="CSQ"
        )

        # Initialize annotators
        self.setup_annotators()

        # Initialize filters
        self.setup_filters()

        # Convert
        line = 0
        for vcf_record in vcf_object.fetch():
            line += 1
            if line % 1000 == 0:
                self.logger.info("Processed {0} records...".format(line))

            # Extract data
            data = self.extract(
                tumor_sample_id,
                normal_sample_id,
                tumor_idx,
                normal_idx,
                ann_cols_format,
                vep_key,
                vcf_record,
                is_tumor_only,
            )

            # Skip rare occasions where VEP doesn't provide IMPACT or the consequence is ?
            if (
                not data["selected_effect"]["IMPACT"]
                or data["selected_effect"]["One_Consequence"] == "?"
            ):
                self.logger.warn(
                    "Skipping record with unknown impact or consequence: {0} - {1}".format(
                        data["selected_effect"]["IMPACT"],
                        data["selected_effect"]["One_Consequence"],
                    )
                )
                continue

            # Transform
            maf_record = self.transform(
                vcf_record, data, is_tumor_only, line_number=line
            )

            # Add to sorter
            sorter += maf_record

        # Write
        self.logger.info("Writing {0} sorted records...".format(line))
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )

        counter = 0
        for record in sorter:
            counter += 1
            if counter % 1000 == 0:
                self.logger.info("Wrote {0} records...".format(counter))
            self.maf_writer += record

        self.logger.info("Finished writing {0} records".format(counter))

    finally:
        vcf_object.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
        for anno in self.annotators:
            if self.annotators[anno]:
                self.annotators[anno].shutdown()

    self.logger.info("Finished")
def test_empty(self):
    sorter = Sorter(100, self.codec(), BarcodesAndCoordinate().sort_key())
    records = [r for r in sorter]
    sorter.close()
    self.assertEqual(len(records), 0)
def test_sorter_default(self):
    sorter = MafSorter(sort_order_name=BarcodesAndCoordinate.name(),
                       max_objects_in_ram=100)
    self.__test_sorter(sorter=sorter)
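The sorter examples above all exercise the same add/iterate/close contract; a condensed sketch of that pattern follows. The `records` iterable and the `write(record)` call are placeholders, while the MafSorter keyword arguments and the `+=`/iteration/`close()` usage mirror the examples above.

# Condensed usage pattern shared by the examples above (placeholders noted
# in the lead-in): feed records in any order, read them back sorted, close.
sorter = MafSorter(
    sort_order_name=BarcodesAndCoordinate.name(),
    max_objects_in_ram=100000,   # records beyond this are spilled out of RAM
)
try:
    for record in records:       # `records` is any iterable of MAF records
        sorter += record         # insertion order does not matter
    for record in sorter:        # yields records in BarcodesAndCoordinate order
        write(record)            # placeholder for e.g. `maf_writer += record`
finally:
    sorter.close()               # release any temporary sorter resources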