def __add_arguments__(cls, subparser):
    """Register this tool's command-line options on ``subparser``.

    Options added:

    * ``-i/--input`` (required): the MAF file to read.
    * ``-o/--output``: output path; defaults to ``None``.
    * ``-s/--sort-order``: one of the names returned by ``SortOrder.all()``;
      defaults to ``BarcodesAndCoordinate.name()``.
    * ``-f/--fasta-index``: optional FASTA index path; defaults to ``None``.
    """
    # The same list of names feeds both ``choices`` and the help text.
    order_names = [order.name() for order in SortOrder.all()]
    order_help = "The sort order to choose. Choices: " + ", ".join(order_names)

    subparser.add_argument(
        "-i",
        "--input",
        dest="input",
        required=True,
        help="A MAF file.",
    )
    subparser.add_argument(
        "-o",
        "--output",
        default=None,
        help="The output file, otherwise output will be to standard output.",
    )
    subparser.add_argument(
        "-s",
        "--sort-order",
        default=BarcodesAndCoordinate.name(),
        choices=order_names,
        help=order_help,
    )
    subparser.add_argument(
        "-f",
        "--fasta-index",
        default=None,
        help="Use the FASTA index (fai) to order genomic coordinates.",
    )
def test_sorter_with_sort_order_args(self):
    """Check a ``MafSorter`` whose contig order comes from a FASTA index.

    A temporary index listing ``chr1`` .. ``chr10`` is written and handed to
    the sorter via ``fasta_index``.  The shared sorter check must succeed for
    a chromosome present in the index (``chr5``) and raise ``ValueError``
    for one that is absent (``1``).
    """
    # One index row per contig.  Every row needs its own trailing comma:
    # adjacent string literals are silently concatenated into a single row.
    lines = [
        "chr1\t248956422\t112\t70\t71",
        "chr2\t242193529\t252513167\t70\t71",
        "chr3\t198295559\t498166716\t70\t71",
        "chr4\t190214555\t699295181\t70\t71",
        "chr5\t181538259\t892227221\t70\t71",
        "chr6\t170805979\t1076358996\t70\t71",
        "chr7\t159345973\t1249605173\t70\t71",
        "chr8\t145138636\t1411227630\t70\t71",
        "chr9\t138394717\t1558439788\t70\t71",
        "chr10\t133797422\t1698811686\t70\t71",
    ]
    fd, fn = tmp_file(lines=lines)
    try:
        sorter = MafSorter(
            sort_order_name=BarcodesAndCoordinate.name(),
            max_objects_in_ram=100,
            fasta_index=fn,
        )
        self.__test_sorter(sorter=sorter, chromosome="chr5")
        # "1" is not a contig name in the index above.
        with self.assertRaises(ValueError):
            self.__test_sorter(sorter=sorter, chromosome="1")
    finally:
        # Remove the temporary index even when an assertion above fails.
        fd.close()
        os.remove(fn)
def test_end_to_end(self):
    """Run the ``sort`` subcommand on a reversed test MAF and check its output.

    Verifies that no records are lost, that the sort-order pragma for
    ``BarcodesAndCoordinate`` appears in the output, and that the output has
    exactly one more line than the input (the added pragma).
    """
    lines, header, records = self.read_test_maf()

    # reverse the lines
    input_lines = header + list(reversed(records))

    subcommand_args = [
        "--version", GdcV1_0_0_PublicScheme.version(),
        "--annotation", GdcV1_0_0_PublicScheme.annotation_spec(),
    ]
    out_lines, stdout, stderr = run_main(
        subcommand="sort",
        lines=input_lines,
        subcommand_args=subcommand_args,
    )

    # Check that we have the same # of records.  Header pragmas ("#...") and
    # the column-name line ("Hugo_Symbol...") are not records.
    out_records = [
        line for line in out_lines
        if not line.startswith(("#", "Hugo_Symbol"))
    ]
    self.assertEqual(len(out_records), len(records))

    # Check that we added the sort pragma
    sort_order_line = "%s%s %s" % (
        MafHeader.HeaderLineStartSymbol,
        MafHeader.SortOrderKey,
        BarcodesAndCoordinate.name(),
    )
    self.assertIn(sort_order_line, out_lines)
    self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
def test_sorter_with_scheme(self):
    """Run the shared sorter check on a sorter built with a ``DummyScheme``."""
    dummy = DummyScheme()
    scheme_sorter = MafSorter(
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=dummy,
        max_objects_in_ram=100,
    )
    self.__test_sorter(with_scheme=True, sorter=scheme_sorter)
def test_with_fasta_index(self):
    """Run the ``sort`` subcommand with a FASTA index that reorders contigs.

    The temporary index lists ``chr13`` before ``chr1``.  It is passed to the
    subcommand through ``--fasta-index`` so that the contig order under test
    actually comes from the index file.
    """
    # change the order of chromosomes!
    fasta_index_lines = [
        "chr13\t114364328\t2106716512\t70\t71",
        "chr1\t248956422\t112\t70\t71",
    ]
    fd, fn = tmp_file(lines=fasta_index_lines)
    try:
        lines, header, records = self.read_test_maf()
        subcommand_args = [
            "--version", GdcV1_0_0_PublicScheme.version(),
            "--annotation", GdcV1_0_0_PublicScheme.annotation_spec(),
            # Without this option the index written above is never read.
            "--fasta-index", fn,
        ]
        out_lines, stdout, stderr = run_main(
            subcommand="sort",
            lines=lines,
            subcommand_args=subcommand_args,
        )

        # Check that we have the same # of records
        out_records = [
            line for line in out_lines
            if not line.startswith("#") and not line.startswith("Hugo_Symbol")
        ]
        self.assertEqual(len(out_records), len(records))

        # Check that we added the sort pragma
        sortOrderLine = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.SortOrderKey,
            BarcodesAndCoordinate.name(),
        )
        self.assertTrue(sortOrderLine in out_lines)

        scheme = find_scheme(
            version=GdcV1_0_0_PublicScheme.version(),
            annotation=GdcV1_0_0_PublicScheme.annotation_spec(),
        )

        # we should see chr13 before chr1
        self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
        found_chr1 = False
        for line in out_lines:
            if line.startswith(MafHeader.HeaderLineStartSymbol):
                continue
            record = MafRecord.from_line(line=line, scheme=scheme)
            # Fails when a chr13 record directly follows a chr1 record; the
            # flag is recomputed per record, so only neighbours are compared.
            # NOTE(review): record["Chromosome"] is compared straight to a
            # str -- confirm the column object supports that comparison.
            self.assertFalse(record["Chromosome"] == "chr13" and found_chr1)
            found_chr1 = record["Chromosome"] == "chr1"
    finally:
        # Remove the temporary index even when an assertion above fails.
        fd.close()
        os.remove(fn)
def do_work(self):
    """Main wrapper function for running protect MAF merging.

    Steps, in order:

    1. Load the input readers and build the output MAF header.
    2. Walk the readers with a ``LocatableOverlapIterator`` and merge each
       overlap set with ``self._merger``.
    3. Re-evaluate the normal-depth filter on every merged record and bring
       its ``GDC_FILTER`` column in line with the result.
    4. Push the merged records through a ``MafSorter`` and write the sorted
       records to ``self.options['output_maf']``.

    The readers and the sorter are always closed in the ``finally`` clause;
    the writer is closed there too when ``self.maf_writer`` is truthy.
    """
    # Reader
    self.load_readers()

    # Header
    self.setup_maf_header()
    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)

    # Sorter
    sorter = MafSorter(max_objects_in_ram=100000,
                       sort_order_name=BarcodesAndCoordinate.name(),
                       scheme=self.maf_header.scheme(),
                       contigs=self.maf_header.contigs())

    # Merger
    self._merger = MafRecordMerger_1_0_0(self._scheme)

    # Overlap iterator
    o_iter = LocatableOverlapIterator(
        self.maf_readers,
        contigs=self.maf_header.contigs(),
        peekable_iterator_class=FilteringPeekableIterator)

    # ndp filter
    ndp_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
    # First tag of the normal-depth filter; used below to test whether a
    # record already carries it.
    ndp_tag = ndp_filter.tags[0]

    # Counts
    processed = 0
    try:
        for record in o_iter:

            # Progress message every 1000 processed intervals.
            if processed > 0 and processed % 1000 == 0:
                self.logger.info(
                    "Processed {0} overlapping intervals...".format(
                        processed))

            result = OverlapSet(record, self.callers)

            for maf_record in self._merger.merge_records(result):
                # The merger may yield None entries; those are skipped.
                if maf_record is not None:
                    # Recheck normal depth
                    gdc_filters = maf_record['GDC_FILTER'].value
                    has_tag = ndp_tag in gdc_filters
                    ndp = ndp_filter.filter(maf_record)
                    # Only rewrite the column when the tag already on the
                    # record disagrees with the fresh filter result.
                    if has_tag != ndp:
                        if ndp:
                            # Filter fires now but the tag is missing: add it.
                            gdc_filters.extend(ndp_filter.tags)
                        else:
                            # Tag present but the filter no longer fires:
                            # drop every occurrence of the first tag.
                            gdc_filters = list(
                                filter(lambda x: x != ndp_filter.tags[0],
                                       gdc_filters))

                        maf_record["GDC_FILTER"] = get_builder(
                            "GDC_FILTER", self._scheme,
                            value=sorted(gdc_filters))

                    # Add to sorter
                    sorter += maf_record

            # NOTE(review): counted once per overlapping interval, matching
            # the progress message above -- confirm this nesting level.
            processed += 1

        self.logger.info(
            "Writing {0} sorted, merged records...".format(processed))

        # Writer
        self.maf_writer = MafWriter.from_path(
            path=self.options['output_maf'],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict)

        counter = 0
        for record in sorter:
            # Progress message every 1000 written records.
            if counter > 0 and counter % 1000 == 0:
                self.logger.info(
                    "Wrote {0} sorted, merged records...".format(counter))
            self.maf_writer += record
            counter += 1

        self.logger.info(
            "Finished writing {0} sorted, merged records.".format(counter))

    finally:
        # Release every resource, whether or not the merge succeeded.
        for reader in self.maf_readers:
            reader.close()
        sorter.close()
        # NOTE(review): presumably maf_writer is initialised to a falsy value
        # before do_work runs -- confirm, since it is only assigned above
        # once merging has finished.
        if self.maf_writer:
            self.maf_writer.close()
def do_work(self):
    """Main wrapper function for running vcf2maf.

    Steps, in order:

    1. Build the output MAF header and a ``MafSorter`` ordered by the
       reference FASTA index given in ``self.options``.
    2. Open the input VCF, validate the tumor/normal sample ids against its
       header and extract the ``CSQ`` annotation layout.
    3. For every VCF record: extract the data, skip it when the selected
       effect has no ``IMPACT`` or its consequence is ``"?"``, otherwise
       transform it into a MAF record and add it to the sorter.
    4. Write the sorted records to ``self.options["output_maf"]``.

    The VCF reader and the sorter are always closed in the ``finally``
    clause, and every non-empty annotator is shut down there; the writer is
    closed too when ``self.maf_writer`` is truthy.
    """
    self.logger.info(
        "Processing input vcf {0}...".format(self.options["input_vcf"])
    )

    # Initialize the maf file
    self.setup_maf_header()

    sorter = MafSorter(
        max_objects_in_ram=100000,
        sort_order_name=BarcodesAndCoordinate.name(),
        scheme=self.maf_header.scheme(),
        fasta_index=self.options["reference_fasta_index"],
    )

    self._scheme = self.maf_header.scheme()
    self._columns = get_columns_from_header(self.maf_header)
    self._colset = set(self._columns)

    # Initialize vcf reader
    vcf_object = pysam.VariantFile(self.options["input_vcf"])
    tumor_sample_id = self.options["tumor_vcf_id"]
    normal_sample_id = self.options["normal_vcf_id"]
    is_tumor_only = self.options["tumor_only"]

    try:
        # Validate samples; the normal sample lookup is allowed to fail in
        # tumor-only mode.
        tumor_idx = assert_sample_in_header(vcf_object, tumor_sample_id)
        normal_idx = assert_sample_in_header(
            vcf_object, normal_sample_id, can_fail=is_tumor_only
        )

        # extract annotation from header
        ann_cols_format, vep_key = extract_annotation_from_header(
            vcf_object, vep_key="CSQ"
        )

        # Initialize annotators
        self.setup_annotators()

        # Initialize filters
        self.setup_filters()

        # Convert
        line = 0
        for vcf_record in vcf_object.fetch():

            line += 1

            if line % 1000 == 0:
                self.logger.info("Processed {0} records...".format(line))

            # Extract data
            data = self.extract(
                tumor_sample_id,
                normal_sample_id,
                tumor_idx,
                normal_idx,
                ann_cols_format,
                vep_key,
                vcf_record,
                is_tumor_only,
            )

            # Skip rare occasions where VEP doesn't provide IMPACT or the consequence is ?
            if (
                not data["selected_effect"]["IMPACT"]
                or data["selected_effect"]["One_Consequence"] == "?"
            ):
                # Logger.warn is a deprecated alias that was removed in
                # Python 3.13; Logger.warning is the supported spelling.
                self.logger.warning(
                    "Skipping record with unknown impact or consequence: {0} - {1}".format(
                        data["selected_effect"]["IMPACT"],
                        data["selected_effect"]["One_Consequence"],
                    )
                )
                continue

            # Transform
            maf_record = self.transform(
                vcf_record, data, is_tumor_only, line_number=line
            )

            # Add to sorter
            sorter += maf_record

        # Write.  ``line`` counts every VCF record read, including the ones
        # skipped above.
        self.logger.info("Writing {0} sorted records...".format(line))
        self.maf_writer = MafWriter.from_path(
            path=self.options["output_maf"],
            header=self.maf_header,
            validation_stringency=ValidationStringency.Strict,
        )

        counter = 0
        for record in sorter:

            counter += 1

            if counter % 1000 == 0:
                self.logger.info("Wrote {0} records...".format(counter))

            self.maf_writer += record

        self.logger.info("Finished writing {0} records".format(counter))

    finally:
        # Release every resource, whether or not the conversion succeeded.
        vcf_object.close()
        sorter.close()
        if self.maf_writer:
            self.maf_writer.close()
        for anno in self.annotators:
            if self.annotators[anno]:
                self.annotators[anno].shutdown()

    self.logger.info("Finished")
def test_sorter_default(self):
    """Run the shared sorter check on a sorter with no scheme or index."""
    default_sorter = MafSorter(
        max_objects_in_ram=100,
        sort_order_name=BarcodesAndCoordinate.name(),
    )
    self.__test_sorter(sorter=default_sorter)