예제 #1
0
    def setup_maf_header(self):
        """
        Build the output MAF header and store it on ``self.maf_header``.

        The header of ``self.maf_reader`` is read first. When the
        ``reference_fasta_index`` option is falsy the contigs of that header
        are forwarded to ``MafHeader.from_defaults``; otherwise the FASTA
        index is forwarded instead. The header date and the aliquot records
        of the input header are then copied into the new header.

        A ``KeyError`` raised while copying ``normal.aliquot`` is re-raised
        unless the ``tumor_only`` option is truthy.
        """
        source_header = MafHeader.from_reader(reader=self.maf_reader)

        # Exactly one of ``fasta_index`` / ``contigs`` is forwarded.
        fasta_index = self.options["reference_fasta_index"]
        defaults = {
            "version": self.options["version"],
            "annotation": self.options["annotation"],
            "sort_order": BarcodesAndCoordinate(),
        }
        if fasta_index:
            defaults["fasta_index"] = fasta_index
        else:
            defaults["contigs"] = source_header.contigs()

        self.maf_header = MafHeader.from_defaults(**defaults)
        self.maf_header.validation_stringency = ValidationStringency.Strict

        date_record = BaseRunner.get_header_date()
        self.maf_header[date_record.key] = date_record

        try:
            self.maf_header["normal.aliquot"] = source_header["normal.aliquot"]
        except KeyError as missing:
            # A missing normal aliquot is tolerated only for tumor-only runs.
            if not self.options["tumor_only"]:
                raise missing

        self.maf_header["tumor.aliquot"] = source_header["tumor.aliquot"]
예제 #2
0
파일: sort.py 프로젝트: MarcSaric/maf-lib
 def __add_arguments__(cls, subparser):
     """
     Register the command-line options on ``subparser``.

     Options added: ``-i/--input`` (required), ``-o/--output``,
     ``-s/--sort-order`` (restricted to the names of ``SortOrder.all()``)
     and ``-f/--fasta-index``.
     """
     order_names = [order.name() for order in SortOrder.all()]

     subparser.add_argument(
         '-i', '--input',
         dest='input',
         required=True,
         help='A MAF file.')

     output_help = ("The output file, otherwise output will be"
                    " to standard output.")
     subparser.add_argument('-o', '--output', default=None, help=output_help)

     # The available choices are spelled out in the help text as well.
     order_help = ("The sort order to choose.  "
                   "Choices: %s" % ", ".join(order_names))
     subparser.add_argument(
         '-s', '--sort-order',
         default=BarcodesAndCoordinate.name(),
         choices=order_names,
         help=order_help)

     index_help = ("Use the FASTA index (fai) to order "
                   "genomic coordinates.")
     subparser.add_argument('-f', '--fasta-index', default=None,
                            help=index_help)
예제 #3
0
    def __init__(self,
                 iters,
                 fasta_index=None,
                 contigs=None,
                 by_barcodes=True,
                 peekable_iterator_class=PeekableIterator):
        """
        Set up the sort order, overlap predicate and wrapped iterators.

        :param iters: the list of iterators.
        :param fasta_index: the path to the FASTA index for defining
        ordering across chromosomes.
        :param contigs: the list of contigs to use for sorting instead of
        parsing from FASTA index. Only forwarded when ``by_barcodes`` is
        true.
        :param by_barcodes: True to require the same tumor and matched
        normal barcodes for returned locatables, False otherwise
        :param peekable_iterator_class: PeekableIterator class to use when
        traversing individual MAFs. This allows developers to add in custom
        filters and custom handling of MAFs.
        """
        self._by_barcodes = by_barcodes

        # The sort order and the overlap predicate are chosen together.
        if not by_barcodes:
            self._sort_order = Coordinate(fasta_index=fasta_index)
            self._overlap_f = self.__overlaps
        else:
            self._sort_order = BarcodesAndCoordinate(
                fasta_index=fasta_index,
                contigs=contigs,
            )
            self._overlap_f = self.__overlaps_with_barcode

        # Every input is first wrapped so its ordering gets enforced, and
        # only then wrapped in the peekable iterator class.
        enforced = []
        for source in iters:
            enforced.append(
                _SortOrderEnforcingIterator(source, self._sort_order))
        self._iters = []
        for wrapped in enforced:
            self._iters.append(peekable_iterator_class(wrapped))

        self._sort_key = self._sort_order.sort_key()
    def setup_maf_header(self):
        """
        Build the output MAF header and store it on ``self.maf_header``.

        The header is created from the ``version``, ``annotation`` and
        ``reference_fasta_index`` options, then the header date and the
        aliquot records are added. The ``normal.aliquot`` record is written
        only when the ``tumor_only`` option is falsy; ``tumor.aliquot`` is
        always written.
        """
        self.maf_header = MafHeader.from_defaults(
            version=self.options["version"],
            annotation=self.options["annotation"],
            sort_order=BarcodesAndCoordinate(),
            fasta_index=self.options["reference_fasta_index"],
        )

        header_date = BaseRunner.get_header_date()
        self.maf_header[header_date.key] = header_date

        # The guard already establishes this is not a tumor-only run, so the
        # value is taken directly from the options.
        if not self.options["tumor_only"]:
            normal_aliquot = MafHeaderRecord(
                key="normal.aliquot",
                value=self.options["normal_aliquot_uuid"],
            )
            self.maf_header[normal_aliquot.key] = normal_aliquot

        tumor_aliquot = MafHeaderRecord(
            key="tumor.aliquot", value=self.options["tumor_aliquot_uuid"]
        )
        self.maf_header[tumor_aliquot.key] = tumor_aliquot
예제 #5
0
    def test_sorter_with_sort_order_args(self):
        """
        Check a sorter that is given a FASTA index file.

        A temporary index listing chr1..chr10 is written, a ``MafSorter`` is
        built with it, and the shared sorter check is run once for ``chr5``
        and once for ``1``, where a ``ValueError`` is expected.
        """
        # One index entry per element; each entry needs its own comma,
        # otherwise adjacent literals are silently joined into one line.
        lines = [
            "chr1\t248956422\t112\t70\t71",
            "chr2\t242193529\t252513167\t70\t71",
            "chr3\t198295559\t498166716\t70\t71",
            "chr4\t190214555\t699295181\t70\t71",
            "chr5\t181538259\t892227221\t70\t71",
            "chr6\t170805979\t1076358996\t70\t71",
            "chr7\t159345973\t1249605173\t70\t71",
            "chr8\t145138636\t1411227630\t70\t71",
            "chr9\t138394717\t1558439788\t70\t71",
            "chr10\t133797422\t1698811686\t70\t71",
        ]
        fd, fn = tmp_file(lines=lines)

        try:
            sorter = MafSorter(
                sort_order_name=BarcodesAndCoordinate.name(),
                max_objects_in_ram=100,
                fasta_index=fn,
            )

            self.__test_sorter(sorter=sorter, chromosome="chr5")

            with self.assertRaises(ValueError):
                self.__test_sorter(sorter=sorter, chromosome="1")
        finally:
            # Remove the temporary index even when an assertion fails.
            fd.close()
            os.remove(fn)
예제 #6
0
    def test_end_to_end(self):
        """
        Run the ``sort`` subcommand on a MAF whose records were reversed.

        Verifies that the number of records is unchanged, that the sort
        order pragma line is present in the output, and that the output has
        exactly one line more than the original input.
        """
        lines, header, records = self.read_test_maf()

        # reverse the lines
        input_lines = header + list(reversed(records))
        subcommand_args = [
            "--version",
            GdcV1_0_0_PublicScheme.version(),
            "--annotation",
            GdcV1_0_0_PublicScheme.annotation_spec(),
        ]
        out_lines, stdout, stderr = run_main(subcommand="sort",
                                             lines=input_lines,
                                             subcommand_args=subcommand_args)

        # Check that we have the same # of records; header pragmas and the
        # column-name line are not records.
        out_records = [
            line for line in out_lines
            if not line.startswith("#")
            and not line.startswith("Hugo_Symbol")
        ]
        self.assertEqual(len(out_records), len(records))

        # Check that we added the sort pragma
        sort_order_line = "%s%s %s" % (MafHeader.HeaderLineStartSymbol,
                                       MafHeader.SortOrderKey,
                                       BarcodesAndCoordinate.name())
        self.assertIn(sort_order_line, out_lines)

        self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
예제 #7
0
 def test_sorter_with_scheme(self):
     """Run the shared sorter check on a sorter built with a DummyScheme."""
     dummy_scheme = DummyScheme()
     sorter_kwargs = {
         "sort_order_name": BarcodesAndCoordinate.name(),
         "scheme": dummy_scheme,
         "max_objects_in_ram": 100,
     }
     self.__test_sorter(sorter=MafSorter(**sorter_kwargs), with_scheme=True)
예제 #8
0
    def __init__(
        self,
        iters: List[MafReader],
        fasta_index: Optional[str] = None,
        contigs: Optional[List[str]] = None,
        by_barcodes: bool = True,
        peekable_iterator_class: Type[PeekableIterator] = PeekableIterator,
    ) -> None:
        """
        Set up the sort order, overlap predicate and wrapped iterators.

        :param iters: the list of iterators.
        :param fasta_index: the path to the FASTA index for defining
        ordering across chromosomes.
        :param contigs: the list of contigs to use for sorting instead of
        parsing from FASTA index. Only forwarded when ``by_barcodes`` is
        true.
        :param by_barcodes: True to require the same tumor and matched
        normal barcodes for returned locatables, False otherwise
        :param peekable_iterator_class: PeekableIterator class to use when
        traversing individual MAFs. This allows developers to add in custom
        filters and custom handling of MAFs.
        """

        # Declared up front because the two branches bind predicates with
        # different signatures.
        self._overlap_f: Union[Callable[
            [_BarcodesAndCoordinateKey, _BarcodesAndCoordinateKey], bool],
                               Callable[[Locatable, Locatable], bool], ]
        if not by_barcodes:
            _sort_order = Coordinate(fasta_index=fasta_index)
            self._overlap_f = self.__overlaps
        else:
            _sort_order = BarcodesAndCoordinate(fasta_index=fasta_index,
                                                contigs=contigs)
            self._overlap_f = self.__overlaps_with_barcode
        self._sort_order: Coordinate = _sort_order
        self._by_barcodes: bool = by_barcodes

        # Trust, but verify: every input is wrapped so its ordering is
        # enforced before it is handed to the peekable iterator class.
        _iters = [
            _SortOrderEnforcingIterator(_iter, self._sort_order)
            for _iter in iters
        ]
        self._iters: List[PeekableIterator] = [
            peekable_iterator_class(_iter) for _iter in _iters
        ]

        self._sort_key: TSortKey = self._sort_order.sort_key()
예제 #9
0
    def test_less_than_max_in_memory(self):
        """
        Add one record fewer than the in-memory limit, in reverse order.

        The sorter is created with ``always_spill=False``. All records must
        come back, with ``End_Position`` values running 0, 1, 2, ...
        """
        limit = 100
        expected_count = limit - 1
        sorter = Sorter(limit,
                        self.codec(),
                        BarcodesAndCoordinate().sort_key(),
                        always_spill=False)

        # Highest end position first, so the input is in reverse order.
        for end_position in reversed(range(expected_count)):
            sorter += DummyRecord("A", "B", "C", 1, end_position)

        sorted_records = [rec for rec in sorter]
        sorter.close()
        self.assertEqual(len(sorted_records), expected_count)

        for expected_end, rec in enumerate(sorted_records):
            self.assertEqual(rec.value("End_Position"), expected_end)
예제 #10
0
    def test_with_fasta_index(self):
        """
        Run the ``sort`` subcommand and check the chromosome order.

        Verifies that the number of records is unchanged, that the sort
        order pragma line is present, that the output has exactly one line
        more than the input, and that no ``chr13`` record directly follows a
        ``chr1`` record.
        """
        # change the order of chromosomes!
        fasta_index_lines = [
            "chr13\t114364328\t2106716512\t70\t71",
            "chr1\t248956422\t112\t70\t71"
        ]
        fd, fn = tmp_file(lines=fasta_index_lines)
        try:
            lines, header, records = self.read_test_maf()
            # NOTE(review): ``fn`` is created above but never handed to
            # run_main (no fasta index argument appears in subcommand_args)
            # -- confirm whether it should be passed.
            subcommand_args = [
                "--version",
                GdcV1_0_0_PublicScheme.version(), "--annotation",
                GdcV1_0_0_PublicScheme.annotation_spec()
            ]
            out_lines, stdout, stderr = run_main(
                subcommand="sort",
                lines=lines,
                subcommand_args=subcommand_args)

            # Check that we have the same # of records
            out_records = [
                line for line in out_lines
                if not line.startswith("#")
                and not line.startswith("Hugo_Symbol")
            ]
            self.assertEqual(len(out_records), len(records))

            # Check that we added the sort pragma
            sortOrderLine = "%s%s %s" % (MafHeader.HeaderLineStartSymbol,
                                         MafHeader.SortOrderKey,
                                         BarcodesAndCoordinate.name())
            self.assertTrue(sortOrderLine in out_lines)

            scheme = find_scheme(
                version=GdcV1_0_0_PublicScheme.version(),
                annotation=GdcV1_0_0_PublicScheme.annotation_spec())
            self.assertEqual(len(out_lines) - 1,
                             len(lines))  # added the pragma

            # we should see chr13 before chr1
            # NOTE(review): found_chr1 is overwritten on every record, so
            # only a chr13 record directly after a chr1 record trips the
            # assertion -- confirm that is intended.
            found_chr1 = False
            for line in out_lines:
                if line.startswith(MafHeader.HeaderLineStartSymbol):
                    continue
                record = MafRecord.from_line(line=line, scheme=scheme)
                self.assertFalse(
                    record["Chromosome"] == "chr13" and found_chr1)
                found_chr1 = record["Chromosome"] == "chr1"
        finally:
            # Remove the temporary index even when an assertion fails.
            fd.close()
            os.remove(fn)
    def setup_maf_header(self):
        """
        Build the output MAF header and store it on ``self.maf_header``.

        Contigs and both aliquot records are taken from the header of the
        first reader in ``self.maf_readers``; version and annotation come
        from the options.
        """
        first_header = MafHeader.from_reader(reader=self.maf_readers[0])

        self.maf_header = new_header = MafHeader.from_defaults(
            version=self.options['version'],
            annotation=self.options['annotation'],
            sort_order=BarcodesAndCoordinate(),
            contigs=first_header.contigs(),
        )
        new_header.validation_stringency = ValidationStringency.Strict

        date_record = BaseRunner.get_header_date()
        new_header[date_record.key] = date_record

        # Copy the aliquot records over, normal first and then tumor.
        for aliquot_key in ("normal.aliquot", "tumor.aliquot"):
            new_header[aliquot_key] = first_header[aliquot_key]
    def do_work(self):
        """
        Merge the input MAFs and write the sorted result.

        Loads the readers, builds the header, walks the overlapping
        intervals of all readers, merges each overlap set, re-evaluates the
        normal depth tag in ``GDC_FILTER`` for each merged record and adds
        the record to a sorter. The sorter content is then written to the
        ``output_maf`` path. Readers, sorter and writer are closed on exit.
        """
        self.load_readers()
        self.setup_maf_header()

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)

        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=BarcodesAndCoordinate.name(),
            scheme=self.maf_header.scheme(),
            contigs=self.maf_header.contigs(),
        )

        self._merger = MafRecordMerger_1_0_0(self._scheme)

        overlap_iter = LocatableOverlapIterator(
            self.maf_readers,
            contigs=self.maf_header.contigs(),
            peekable_iterator_class=FilteringPeekableIterator,
        )

        depth_filter = Filters.NormalDepth.setup(self.options['min_n_depth'])
        depth_tag = depth_filter.tags[0]

        n_intervals = 0
        try:
            for overlap in overlap_iter:
                # Progress message every 1000 intervals, skipping zero.
                if n_intervals > 0 and n_intervals % 1000 == 0:
                    self.logger.info(
                        "Processed {0} overlapping intervals...".format(
                            n_intervals))

                overlap_set = OverlapSet(overlap, self.callers)

                for merged in self._merger.merge_records(overlap_set):
                    if merged is None:
                        continue

                    # Bring the depth tag in line with what the filter
                    # reports for the merged record.
                    current_tags = merged['GDC_FILTER'].value
                    is_tagged = depth_tag in current_tags
                    depth_flag = depth_filter.filter(merged)
                    if is_tagged != depth_flag:
                        if depth_flag:
                            current_tags.extend(depth_filter.tags)
                        else:
                            current_tags = [
                                tag for tag in current_tags
                                if tag != depth_filter.tags[0]
                            ]

                        merged["GDC_FILTER"] = get_builder(
                            "GDC_FILTER",
                            self._scheme,
                            value=sorted(current_tags),
                        )

                    sorter += merged

                n_intervals += 1

            self.logger.info(
                "Writing {0} sorted, merged records...".format(n_intervals))

            self.maf_writer = MafWriter.from_path(
                path=self.options['output_maf'],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict,
            )

            n_written = 0
            for sorted_record in sorter:
                if n_written > 0 and n_written % 1000 == 0:
                    self.logger.info(
                        "Wrote {0} sorted, merged records...".format(
                            n_written))
                self.maf_writer += sorted_record
                n_written += 1

            self.logger.info(
                "Finished writing {0} sorted, merged records.".format(
                    n_written))

        finally:
            for open_reader in self.maf_readers:
                open_reader.close()

            sorter.close()

            if self.maf_writer:
                self.maf_writer.close()
예제 #13
0
    def do_work(self):
        """
        Convert the input VCF into a sorted MAF.

        Builds the header and a sorter, validates that the configured
        samples are in the VCF header, extracts the annotation format,
        sets up annotators and filters, then extracts and transforms every
        VCF record into a MAF record that is added to the sorter. Records
        without an IMPACT, or with a ``?`` consequence, are skipped with a
        warning. The sorter content is finally written to the
        ``output_maf`` path. The VCF, sorter, writer and annotators are
        released on exit.
        """
        self.logger.info(
            "Processing input vcf {0}...".format(self.options["input_vcf"])
        )

        # Initialize the maf file
        self.setup_maf_header()

        sorter = MafSorter(
            max_objects_in_ram=100000,
            sort_order_name=BarcodesAndCoordinate.name(),
            scheme=self.maf_header.scheme(),
            fasta_index=self.options["reference_fasta_index"],
        )

        self._scheme = self.maf_header.scheme()
        self._columns = get_columns_from_header(self.maf_header)
        self._colset = set(self._columns)

        # Initialize vcf reader
        vcf_object = pysam.VariantFile(self.options["input_vcf"])
        tumor_sample_id = self.options["tumor_vcf_id"]
        normal_sample_id = self.options["normal_vcf_id"]
        is_tumor_only = self.options["tumor_only"]

        try:
            # Validate samples; the normal sample lookup is allowed to fail
            # for tumor-only runs.
            tumor_idx = assert_sample_in_header(vcf_object, tumor_sample_id)
            normal_idx = assert_sample_in_header(
                vcf_object, normal_sample_id, can_fail=is_tumor_only
            )

            # extract annotation from header
            ann_cols_format, vep_key = extract_annotation_from_header(
                vcf_object, vep_key="CSQ"
            )

            # Initialize annotators
            self.setup_annotators()

            # Initialize filters
            self.setup_filters()

            # Convert; ``line`` counts every VCF record read, including the
            # ones skipped below.
            line = 0
            for vcf_record in vcf_object.fetch():

                line += 1

                if line % 1000 == 0:
                    self.logger.info("Processed {0} records...".format(line))

                # Extract data
                data = self.extract(
                    tumor_sample_id,
                    normal_sample_id,
                    tumor_idx,
                    normal_idx,
                    ann_cols_format,
                    vep_key,
                    vcf_record,
                    is_tumor_only,
                )

                # Skip rare occasions where VEP doesn't provide IMPACT or the consequence is ?
                if (
                    not data["selected_effect"]["IMPACT"]
                    or data["selected_effect"]["One_Consequence"] == "?"
                ):
                    # ``warning`` replaces the deprecated ``warn`` alias.
                    self.logger.warning(
                        "Skipping record with unknown impact or consequence: {0} - {1}".format(
                            data["selected_effect"]["IMPACT"],
                            data["selected_effect"]["One_Consequence"],
                        )
                    )
                    continue

                # Transform
                maf_record = self.transform(
                    vcf_record, data, is_tumor_only, line_number=line
                )

                # Add to sorter
                sorter += maf_record

            # Write
            self.logger.info("Writing {0} sorted records...".format(line))
            self.maf_writer = MafWriter.from_path(
                path=self.options["output_maf"],
                header=self.maf_header,
                validation_stringency=ValidationStringency.Strict,
            )

            counter = 0
            for record in sorter:

                counter += 1

                if counter % 1000 == 0:
                    self.logger.info("Wrote {0} records...".format(counter))

                self.maf_writer += record

            self.logger.info("Finished writing {0} records".format(counter))

        finally:
            vcf_object.close()
            sorter.close()
            if self.maf_writer:
                self.maf_writer.close()
            # Only annotators that were actually set up are shut down.
            for anno in self.annotators:
                if self.annotators[anno]:
                    self.annotators[anno].shutdown()

        self.logger.info("Finished")
예제 #14
0
 def test_empty(self):
     """Iterating a sorter that received no records yields nothing."""
     codec = self.codec()
     sort_key = BarcodesAndCoordinate().sort_key()
     sorter = Sorter(100, codec, sort_key)
     collected = [item for item in sorter]
     sorter.close()
     self.assertEqual(len(collected), 0)
예제 #15
0
 def test_sorter_default(self):
     """Run the shared sorter check on a sorter with no scheme or index."""
     order_name = BarcodesAndCoordinate.name()
     default_sorter = MafSorter(
         sort_order_name=order_name,
         max_objects_in_ram=100,
     )
     self.__test_sorter(sorter=default_sorter)