Exemplo n.º 1
0
def evaluate_performance_metrics_and_write_results(
        truth_calls: str, ref_dict_file: str, gcnv_segment_vcfs: list,
        padded_interval_file: str, blacklisted_intervals_truth: str,
        callset_filter_names: list, callset_filter_max_values: list,
        callset_filter_num_bins: list, attribute_for_roc_creation: str,
        output_dir: str, truth_allele_frequency_threshold: float):
    """
    Evaluate a gCNV callset against a truth callset and write the results to disk.

    :param truth_calls: path handed to TruthCallset.read_in_callset as ``truth_file``
    :param ref_dict_file: path read with ReferenceDictionary.read_in
    :param gcnv_segment_vcfs: VCFs handed to GCNVCallset.read_in_callset
    :param padded_interval_file: interval list path; given to the truth callset reader and also read
        as the collection of considered intervals
    :param blacklisted_intervals_truth: interval list path, read and handed to the Evaluator
    :param callset_filter_names: forwarded unchanged to Evaluator.evaluate_callset
    :param callset_filter_max_values: forwarded unchanged to Evaluator.evaluate_callset
    :param callset_filter_num_bins: forwarded unchanged to Evaluator.evaluate_callset
    :param attribute_for_roc_creation: forwarded to write_performance_curves_to_file
    :param output_dir: destination handed to both result writers
    :param truth_allele_frequency_threshold: forwarded to the truth callset reader as
        ``allele_frequency_threshold``
    """
    io_plt.log("Reading in callsets.")
    reference_dictionary = ReferenceDictionary.read_in(ref_dict_file)
    callset_under_test = GCNVCallset.read_in_callset(
        gcnv_segment_vcfs=gcnv_segment_vcfs,
        reference_dictionary=reference_dictionary)
    reference_truth = TruthCallset.read_in_callset(
        truth_file=truth_calls,
        interval_file=padded_interval_file,
        reference_dictionary=reference_dictionary,
        allele_frequency_threshold=truth_allele_frequency_threshold)
    intervals_in_scope = IntervalCollection.read_interval_list(
        padded_interval_file)
    # Use a separate local for the parsed collection instead of rebinding the path parameter.
    truth_blacklist = IntervalCollection.read_interval_list(
        blacklisted_intervals_truth)

    io_plt.log("Evaluating the callset against the truth.")
    callset_evaluator = Evaluator(
        evaluation_name="test_eval",
        considered_intervals=intervals_in_scope,
        blacklisted_intervals_truth=truth_blacklist)
    evaluation = callset_evaluator.evaluate_callset(
        callset_truth=reference_truth,
        callset_to_evaluate=callset_under_test,
        callset_filter_names=callset_filter_names,
        callset_filter_max_values=callset_filter_max_values,
        callset_filter_num_bins=callset_filter_num_bins)

    # F1 measures are computed before either writer is invoked.
    evaluation.compute_f1_measures()
    evaluation.write_performance_curves_to_file(output_dir,
                                                attribute_for_roc_creation)
    evaluation.write_results(output_dir)
Exemplo n.º 2
0
def test_truth_callset():
    """Check truth callset parsing, coverage filtering, per-sample views and rare-interval subsetting."""
    analyzed_intervals = IntervalCollection.read_interval_list(
        ANALYZED_INTERVALS)

    # Parsing: the freshly read callset must match the expected PyRanges table.
    parsed_callset = TruthCallset.read_in_callset(
        truth_callset_bed_file=TRUTH_CALLSET_TEST_BED,
        interval_collection=analyzed_intervals,
        samples_to_keep=SAMPLES_TO_KEEP)
    parsed_df = parsed_callset.truth_callset_pyrange.df
    assert parsed_df.equals(TRUTH_CALLSET_TEST_PYRANGE_EXPECTED.df)

    # Filtering: the return value is not used; the callset itself is re-inspected afterwards.
    parsed_callset.filter_out_uncovered_events(analyzed_intervals,
                                               min_overlap_fraction=0.3)
    filtered_df = parsed_callset.truth_callset_pyrange.df
    assert filtered_df.equals(TRUTH_CALLSET_TEST_FILTERED_PYRANGE_EXPECTED.df)

    # Per-sample tables are compared after filtering.
    for sample in parsed_callset.sample_set:
        per_sample_df = parsed_callset.sample_to_pyrange_map[sample].df
        assert per_sample_df.equals(
            TRUTH_CALLSET_SAMPLE_TO_PYRANGE_MAP[sample].df)

    # Rare-region subsetting of the analyzed intervals.
    rare_subset = parsed_callset.subset_intervals_to_rare_regions(
        analyzed_intervals, max_allelic_fraction=0.5)
    rare_subset_df = rare_subset.pyrange.df
    assert rare_subset_df.equals(
        TRUTH_CALLSET_RARE_INTERVALS_SUBSET_PYRANGE_EXPECTED.df)
Exemplo n.º 3
0
def test_bin_evaluator():
    """Check the per-bin evaluation of the test gCNV callset against the expected result."""
    analyzed_intervals = IntervalCollection.read_interval_list(
        ANALYZED_INTERVALS)
    calls_under_test = GCNVCallset.read_in_callset(
        gcnv_segment_vcfs=GCNV_CALLSET_TEST_VCF_LIST)
    # The truth callset is restricted to the samples present in the gCNV callset.
    truth = TruthCallset.read_in_callset(
        truth_callset_bed_file=TRUTH_CALLSET_TEST_BED,
        interval_collection=analyzed_intervals,
        samples_to_keep=calls_under_test.sample_set)

    bin_evaluator = PerBinEvaluator(
        truth_callset=truth, interval_collection=analyzed_intervals)
    actual_result = bin_evaluator.evaluate_callset_against_the_truth(
        calls_under_test, 4)
    assert actual_result == PER_BIN_EVALUATION_RESULT_EXPECTED
Exemplo n.º 4
0
def test_event_evaluator():
    """Check the per-event evaluation of the test gCNV callset against the expected result."""
    analyzed_intervals = IntervalCollection.read_interval_list(
        ANALYZED_INTERVALS)
    calls_under_test = GCNVCallset.read_in_callset(
        gcnv_segment_vcfs=GCNV_CALLSET_TEST_VCF_LIST)
    # The truth callset is restricted to the samples present in the gCNV callset.
    truth = TruthCallset.read_in_callset(
        truth_callset_bed_file=TRUTH_CALLSET_TEST_BED,
        interval_collection=analyzed_intervals,
        samples_to_keep=calls_under_test.sample_set)
    # The truth callset is filtered before the evaluator is constructed from it.
    truth.filter_out_uncovered_events(analyzed_intervals,
                                      min_overlap_fraction=0.3)

    event_evaluator = PerEventEvaluator(truth_callset=truth)
    actual_result = event_evaluator.evaluate_callset_against_the_truth(
        gcnv_callset=calls_under_test, minimum_reciprocal_overlap=0.4)
    assert actual_result == PER_EVENT_EVALUATION_RESULT_EXPECTED
Exemplo n.º 5
0
def test_sample_by_interval_matrix():
    """Check the callset matrix views produced by the truth callset and by a gCNV callset."""
    analyzed_intervals = IntervalCollection.read_interval_list(
        ANALYZED_INTERVALS)

    # Matrix view of the truth callset over all of its samples.
    truth = TruthCallset.read_in_callset(
        truth_callset_bed_file=TRUTH_CALLSET_TEST_BED,
        interval_collection=analyzed_intervals,
        samples_to_keep=SAMPLES_TO_KEEP)
    truth_samples = list(truth.sample_set)
    truth_matrix_view = truth.get_callset_matrix_view(analyzed_intervals,
                                                      truth_samples)
    assert truth_matrix_view == TRUTH_CALLSET_CALLSET_MATRIX_VIEW_EXPECTED

    # Matrix view of a gCNV callset read from a single VCF.
    calls_under_test = GCNVCallset.read_in_callset(
        gcnv_segment_vcfs=[GCNV_CALLSET_MATRIX_TEST_VCF])
    gcnv_samples = list(calls_under_test.sample_set)
    gcnv_matrix_view = calls_under_test.get_callset_matrix_view(
        analyzed_intervals, gcnv_samples)
    assert gcnv_matrix_view == GCNV_CALLSET_MATRIX_VIEW_EXPECTED
def test_interval_collection_class(temp_dir: str):
    """
    Check intersection search, truncating intersection and subtraction on IntervalCollection.

    :param temp_dir: not used by the body; presumably supplied by a test fixture (confirm)
    """
    chr1_1k_2k = Interval("1", 1001, 2000)
    chr1_2k_3k = Interval("1", 2001, 3000)
    chr1_3k_4k = Interval("1", 3001, 4000)
    chr1_10k_20k = Interval("1", 10000, 20000)
    chr2_1k_2k = Interval("2", 1001, 2000)
    chr2_2k_3k = Interval("2", 2001, 3000)

    header = "@TESTHEADER"
    collection = IntervalCollection(
        interval_list=[
            chr1_1k_2k, chr1_2k_3k, chr1_3k_4k,
            chr1_10k_20k, chr2_1k_2k, chr2_2k_3k
        ],
        header=header)

    # find_intersection: one hit, two hits, and no hit at all.
    single_hit = collection.find_intersection(Interval("1", 500, 1500))
    assert single_hit == [chr1_1k_2k]
    double_hit = collection.find_intersection(Interval("1", 2500, 3500))
    assert double_hit == [chr1_2k_3k, chr1_3k_4k]
    no_hit = collection.find_intersection(Interval("1", 100000, 100100))
    assert no_hit == []

    # find_intersection_with_interval_and_truncate: hits are clipped to the query bounds.
    truncated_hits = collection.find_intersection_with_interval_and_truncate(
        Interval("1", 2500, 3500))
    assert truncated_hits == [
        Interval("1", 2500, 3000),
        Interval("1", 3001, 3500)
    ]

    # Subtraction operator: the first two chromosome "1" intervals are expected to disappear.
    subtrahend = IntervalCollection([
        Interval("1", 1001, 2000),
        Interval("1", 2500, 2501),
        Interval("3", 100, 200)
    ])
    expected_difference = IntervalCollection(
        interval_list=[chr1_3k_4k, chr1_10k_20k, chr2_1k_2k, chr2_2k_3k],
        header=header)
    assert collection - subtrahend == expected_difference
    def subset_intervals_to_rare_regions(
            self, intervals: IntervalCollection,
            max_allelic_fraction: float) -> IntervalCollection:
        """
        Subset an interval collection to regions associated with rare truth events.

        The truth callset is intersected with the given intervals, the intervals are left-joined to
        those overlaps, joined rows whose "Frequency" is at most max_allelic_fraction are kept, and
        the surviving rows are merged.

        :param intervals: interval collection to subset
        :param max_allelic_fraction: rows with a "Frequency" at or below this value are kept
        :return: interval collection built from the merged rare rows
        """
        # TODO Handle the case of an interval from collection overlapping multiple events. In that case we should only
        # consider allelic fraction of the event with the largest reciprocal overlap

        def is_rare(df):
            return df["Frequency"] <= max_allelic_fraction

        candidate_pr = intervals.pyrange
        truth_overlaps = self.truth_callset_pyrange.intersect(candidate_pr)
        # NOTE(review): with how="left", intervals without any overlapping event presumably get a
        # placeholder "Frequency"; confirm that such rows are meant to pass the filter.
        merged_rare_pr = (candidate_pr
                          .join(truth_overlaps, how="left")
                          .subset(is_rare)
                          .merge())
        # Rebuilding the PyRanges from its dataframe works around a dtypes issue.
        return IntervalCollection(pr.PyRanges(merged_rare_pr.df))
def evaluate_cnv_callsets_and_plot_results(analyzed_intervals: str, truth_callset_bed: str, gcnv_vcfs: List[str],
                                           output_directory: str):
    """
    Run the per-event and per-bin evaluations of a gCNV callset against a truth callset and plot both.

    :param analyzed_intervals: path to the interval list read into an IntervalCollection
    :param truth_callset_bed: path handed to TruthCallset.read_in_callset as ``truth_callset_bed_file``
    :param gcnv_vcfs: gCNV segment VCFs handed to GCNVCallset.read_in_callset
    :param output_directory: destination handed to both plotting functions
    """
    print("Reading in interval list...", flush=True)
    interval_collection = IntervalCollection.read_interval_list(analyzed_intervals)
    print("Reading in gCNV callset...", flush=True)
    gcnv_callset = GCNVCallset.read_in_callset(gcnv_segment_vcfs=gcnv_vcfs)
    print("Reading in truth callset...", flush=True)
    # Only samples that are present in the gCNV callset are kept in the truth callset.
    truth_callset = TruthCallset.read_in_callset(truth_callset_bed_file=truth_callset_bed,
                                                 interval_collection=interval_collection,
                                                 samples_to_keep=gcnv_callset.sample_set)
    print("Filtering truth callset...", flush=True)
    truth_callset.filter_out_uncovered_events(interval_collection)

    print("Performing per event evaluation...", flush=True)
    per_event_evaluator = PerEventEvaluator(truth_callset=truth_callset)
    per_event_evaluation_result = per_event_evaluator.evaluate_callset_against_the_truth(gcnv_callset=gcnv_callset)
    plotting.plot_and_save_per_event_evaluation_results(per_event_evaluation_result, output_directory)

    # This stage is the per-bin evaluation; the progress message previously repeated "per event".
    print("Performing per bin evaluation...", flush=True)
    # The per-bin evaluation is restricted to intervals selected with a 0.01 allelic fraction cutoff.
    rare_intervals_subset = truth_callset.subset_intervals_to_rare_regions(interval_collection, max_allelic_fraction=0.01)
    per_bin_evaluator = PerBinEvaluator(truth_callset=truth_callset, interval_collection=rare_intervals_subset)
    # TODO pass an optional number of PR curve points parameter
    per_bin_evaluation_result = per_bin_evaluator.evaluate_callset_against_the_truth(gcnv_callset)
    plotting.plot_and_save_per_bin_evaluation_results(per_bin_evaluation_result, output_directory)
Exemplo n.º 9
0
    def read_in_callset(cls, **kwargs):
        """
        Build a truth callset from a tab-separated truth file.

        Expected keyword arguments:

        :param truth_file: path to a headerless tab-separated file with the columns chrom, start, end,
            name, svtype and samples (a comma-separated list of sample names); text following "#" is
            treated as a comment
        :param interval_file: interval list path; events intersecting none of its intervals are skipped
        :param reference_dictionary: handed unchanged to the class constructor
        :param allele_frequency_threshold: events carried by a larger fraction of the samples found in
            the file are skipped
        :return: ``cls(sample_to_calls_map, reference_dictionary)`` where each sample maps to a
            FeatureCollection of its calls
        :raises AssertionError: if ``truth_file`` is not among the keyword arguments
        :raises KeyError: if any of the other keyword arguments is missing
        :raises ValueError: if an event starts before the previously retained event on the same contig
        """
        assert "truth_file" in kwargs
        truth_file = kwargs["truth_file"]
        interval_file = kwargs["interval_file"]
        ref_dict = kwargs["reference_dictionary"]
        allele_frequency_threshold = kwargs["allele_frequency_threshold"]

        considered_interval_collection = IntervalCollection.read_interval_list(
            interval_file)

        # Read inside a context manager so the file handle is closed once parsing is done.
        with open(truth_file, 'r') as truth_file_handle:
            truth_calls_pd = pd.read_csv(
                truth_file_handle,
                sep="\t",
                comment="#",
                header=None,
                names=["chrom", "start", "end", "name", "svtype", "samples"],
                dtype={
                    "chrom": str,
                    "start": int,
                    "end": int,
                    "name": str,
                    "svtype": str,
                    "samples": str
                })

        # Do a single pass over the truth callset to initialize the set of samples contained in it
        sample_set = set()
        for samples_field in truth_calls_pd["samples"]:
            sample_set.update(samples_field.split(","))
        number_of_samples = len(sample_set)

        # Do the second pass to initialize everything else
        sample_to_calls_map = {sample: [] for sample in sample_set}
        previous_interval_truth = None
        number_of_not_rescued_overlapping_events = 0
        number_of_overlapping_events_same_genotype = 0
        number_of_enveloped_events = 0
        overall_events = 0
        event_filtered_out_due_allele_freq_threshold = 0
        for _, row in truth_calls_pd.iterrows():
            event_type = cls.__get_event_type_from_sv_type(row["svtype"])

            # Rows whose svtype does not map to an event type are ignored entirely.
            if event_type is None:
                continue

            interval = Interval(row["chrom"], int(row["start"]),
                                int(row["end"]))
            if previous_interval_truth is not None and interval.chrom == previous_interval_truth.chrom \
                    and interval.start < previous_interval_truth.start:
                raise ValueError(
                    "Intervals Interval(%s) and Interval(%s) in truth callset are not in sorted order"
                    % (previous_interval_truth, interval))
            # Do not include calls outside of our interval list of interest
            if not considered_interval_collection.find_intersection(interval):
                continue
            # Do not include calls with allele frequency above specified
            sample_names = set(row["samples"].split(","))
            overall_events += len(sample_names)
            if len(sample_names) / number_of_samples > allele_frequency_threshold:
                event_filtered_out_due_allele_freq_threshold += len(
                    sample_names)
                continue
            for sample_name in sample_names:
                call = Call(interval=interval,
                            sample=sample_name,
                            event_type=event_type,
                            call_attributes=None)
                # Every sample name was registered during the first pass, so the lookup cannot fail.
                sample_calls = sample_to_calls_map[sample_name]
                if sample_calls and sample_calls[-1].interval.intersects_with(interval):
                    last_call = sample_calls[-1]
                    last_interval = last_call.interval
                    if last_interval.end <= interval.end and last_call.event_type == call.event_type:
                        # Merge overlapping events with the same call
                        last_call.interval = Interval(interval.chrom,
                                                      last_interval.start,
                                                      interval.end)
                        number_of_overlapping_events_same_genotype += 1
                    elif interval.end < last_interval.end:
                        # If one call is contained in another only keep the larger call
                        # NOTE(review): this branch does not compare event types, although the log
                        # message below speaks of "different genotypes".
                        number_of_enveloped_events += 1
                    else:
                        number_of_not_rescued_overlapping_events += 1
                    # An overlapping event is never appended as a separate call.
                    continue
                sample_calls.append(call)
            # Only events that passed the filters above serve as the reference for the sort check.
            previous_interval_truth = interval

        # Replace each per-sample list with a FeatureCollection keyed by call interval.
        for sample_name in sample_set:
            interval_to_call_map = OrderedDict(
                (call.interval, call)
                for call in sample_to_calls_map[sample_name])
            sample_to_calls_map[sample_name] = FeatureCollection(
                interval_to_call_map)

        io_plt.log("There are %d unique samples in truth set." %
                   (len(sample_set)))
        io_plt.log(
            "There are %d events for all samples in the truth call set." %
            overall_events)
        io_plt.log(
            "There are %d events that were filtered out due to the allele frequency threshold."
            % event_filtered_out_due_allele_freq_threshold)
        io_plt.log(
            "There are %d intersecting events in truth set that were not rescued."
            % number_of_not_rescued_overlapping_events)
        io_plt.log("There are %d overlapping events with the same genotype." %
                   number_of_overlapping_events_same_genotype)
        io_plt.log("There are %d enveloped events with different genotypes." %
                   number_of_enveloped_events)
        return cls(sample_to_calls_map, ref_dict)