def calculate_area(nested_genotype: pandas.Series, unnested_genotype: pandas.Series) -> float:
    """
    Score whether the unnested genotype behaves like a subset of the nested genotype.

    The unnested genotype is tested against two candidates: the nested genotype
    itself and the complement of the nested genotype (``1 - nested_genotype``),
    which stands in for every genotype that is not the nested one.

    Parameters
    ----------
    nested_genotype, unnested_genotype: pandas.Series

    Returns
    -------
    float
        * ``-2`` when the subset test against the nested genotype fails.
        * ``2`` when the subset test passes for the nested genotype only.
        * ``1`` or ``0`` when it passes for both candidates: ``1`` if the area shared
          with the nested genotype is more than twice the area shared with the
          complement, ``0`` otherwise.
    """
    # Everything not covered by the nested genotype is grouped into a single `other` series.
    # noinspection PyTypeChecker
    remainder: pandas.Series = 1 - nested_genotype

    nested_area = areascore.area_of_series(nested_genotype)
    unnested_area = areascore.area_of_series(unnested_genotype)
    remainder_area = areascore.area_of_series(remainder)

    shared_with_nested = areascore.calculate_common_area(nested_genotype, unnested_genotype)
    shared_with_remainder = areascore.calculate_common_area(remainder, unnested_genotype)

    fits_nested = areascore.is_subset_legacy(nested_area, unnested_area, shared_with_nested)
    fits_remainder = areascore.is_subset_legacy(remainder_area, unnested_area, shared_with_remainder)

    if not fits_nested:
        return -2
    if not fits_remainder:
        return 2
    # Both candidates fit: prefer the nested genotype only if its overlap clearly dominates.
    return int(shared_with_nested > 2 * shared_with_remainder)
def test_area_of_series_real():
    """
    Check `area_of_series` against `calculate_area` for every genotype row of a real table.

    Uses the first entry of `real_tables` and reads its "genotype" sheet, indexed by
    the 'Genotype' column. The two area implementations must agree to within 0.001.
    """
    workbook_path = list(real_tables.values())[0]
    genotype_sheet = pandas.read_excel(workbook_path, sheet_name="genotype")
    genotypes = genotype_sheet.set_index('Genotype')

    for label, trajectory in genotypes.iterrows():
        logger.debug(label)
        reference_area = areascore.calculate_area(trajectory)
        measured_area = areascore.area_of_series(trajectory)
        assert pytest.approx(measured_area, abs=.001) == reference_area
def test_calculate_overlapping_area(full_overlap, no_overlap, partial_overlap):
    """
    Exercise `calculate_common_area` on three fixture pairs and one inline pair.

    Each fixture is unpacked as a `(left, right)` pair of series.
    """
    tolerance = 0.001

    # Full overlap: the common area is expected to equal the area of the right-hand series.
    series_a, series_b = full_overlap
    full_overlap_right_area = areascore.area_of_series(series_b)
    shared_area = areascore.calculate_common_area(series_a, series_b)
    assert pytest.approx(shared_area, abs=tolerance) == full_overlap_right_area

    # No overlap: nothing is shared.
    series_a, series_b = no_overlap
    shared_area = areascore.calculate_common_area(series_a, series_b)
    assert pytest.approx(shared_area, abs=tolerance) == 0

    # Partial overlap.
    # NOTE(review): the expected value is the right-hand area taken from the
    # `full_overlap` fixture, not recomputed for `partial_overlap` - confirm this is intended.
    series_a, series_b = partial_overlap
    shared_area = areascore.calculate_common_area(series_a, series_b)
    assert pytest.approx(shared_area, abs=tolerance) == full_overlap_right_area

    # Inline pair: the common area is expected to equal the area of the second series.
    series_a = pandas.Series(
        [0.01, 0.279, 0.341, 0.568, 0.708, 0.913, 0.756, 0.455, 0.399, 0.13, 0.041]
    )
    series_b = pandas.Series(
        [0, 0, 0, 0, 0, 0.247, 0.388, 0.215, 0.399, 0.13, 0.028]
    )
    shared_area = areascore.calculate_common_area(series_a, series_b)
    assert pytest.approx(shared_area) == areascore.area_of_series(series_b)
def test_area_of_series(series, expected):
    """
    Check that `area_of_series` returns the expected area for a sequence of values.

    Parameters
    ----------
    series:
        Values used to build the `pandas.Series` under test.
    expected:
        The area `area_of_series` should return for that series.
    """
    s = pandas.Series(series)
    result = areascore.area_of_series(s)
    # The previous `assert pytest.approx(result, expected)` passed `expected` as the
    # relative tolerance and only asserted the truthiness of the approx object, so no
    # comparison was ever made. Compare explicitly instead.
    # NOTE(review): the 0.01 absolute tolerance mirrors `test_shoelace`; tighten it if
    # the parametrized expected values are more precise.
    assert pytest.approx(result, abs=0.01) == expected
def jaccard_distance(left: pandas.Series, right: pandas.Series) -> float:
    """
    Return the Jaccard distance between two series, computed from their areas.

    The distance is ``1 - shared / (area_left + area_right - shared)``, where the
    areas come from `area_of_series` and the shared area from `calculate_common_area`.

    Parameters
    ----------
    left, right: pandas.Series

    Returns
    -------
    float
        The Jaccard distance, or ``0.0`` when the combined area of both series is zero.
    """
    area_left = area_of_series(left)
    area_right = area_of_series(right)
    area_shared = calculate_common_area(left, right)

    area_union = area_left + area_right - area_shared
    if area_union == 0:
        # Neither series encloses any area, so the ratio is 0/0. Treat the two series
        # as indistinguishable rather than raising ZeroDivisionError or returning NaN.
        return 0.0

    return 1 - area_shared / area_union
def test_shoelace(trajectory_table, key, expected):
    """
    Check the area computed for the row `key` of `trajectory_table`.

    The result of `area_of_series` must match `expected` to within 0.01.
    """
    # Keep this for now to make sure the area is being calculated correctly.
    trajectory = trajectory_table.loc[key]
    computed_area = areascore.area_of_series(trajectory)
    assert pytest.approx(computed_area, abs=0.01) == expected
def calculate_score_area(self, nested_genotype: pandas.Series, unnested_genotype: pandas.Series) -> float:
    """
    Calculates a score based on the probability the unnested genotype is a subset of the
    nested_genotype. This takes into account both the area of the nested genotype and the
    area of all other genotypes not in the nested genotype.

    The base score is chosen from the polygon subset tests:

    * subset of both the nested genotype and the `other` genotypes: ``1`` if the area shared
      with the nested genotype is more than twice the area shared with `other`, else ``0``;
    * subset of the nested genotype only: ``1``;
    * the nested genotype is a subset of the unnested one, and the unnested genotype is not
      a subset of `other`: ``-1``;
    * anything else: ``0``.

    The score is then forced to ``-1`` if it is ``0`` and the polygon difference area exceeds
    twice the shared area; when that check does not apply, it is forced to ``-1`` if the
    unnested area exceeds twice the nested area.

    Parameters
    ----------
    nested_genotype, unnested_genotype: pandas.Series

    Returns
    -------
    float
        The score multiplied by `self.weight_jaccard`.
    """
    # If the nested genotype is not fixed, group the remaining frequencies into an `other` category.
    # The larger of the two genotypes (by mean difference) is subtracted from the limit, in case
    # we're testing if a small genotype contains a large genotype.
    difference_series = nested_genotype - unnested_genotype
    larger_genotype = nested_genotype if difference_series.mean() > 0 else unnested_genotype
    # noinspection PyTypeChecker
    other_genotypes: pandas.Series = self.flimit - larger_genotype
    # Since the flimit is not exactly 1, negative remainders are replaced with a small positive value.
    other_genotypes = other_genotypes.mask(lambda s: s < 0, 0.0001)

    unnested_polygon = polygon.as_polygon(unnested_genotype)
    nested_polygon = polygon.as_polygon(nested_genotype)
    other_polygon = polygon.as_polygon(other_genotypes)

    is_subset_nested = areascore.is_subset_polygon(nested_polygon, unnested_polygon)
    is_subset_other = areascore.is_subset_polygon(other_polygon, unnested_polygon)
    # Check the reverse case
    is_subset_nested_reversed = areascore.is_subset_polygon(unnested_polygon, nested_polygon)

    nested_area = areascore.area_of_series(nested_genotype)
    unnested_area = areascore.area_of_series(unnested_genotype)

    common_area_nested = areascore.X_and_Y_polygon(unnested_polygon, nested_polygon)
    # This does not distinguish between xor left vs xor right
    xor_area_unnested = areascore.difference_polygon(unnested_polygon, nested_polygon)

    if self.debug:
        # NOTE(review): the label says `calculate_score_jaccard()` although this method is
        # `calculate_score_area`; left untouched because it is runtime log text.
        logger.debug(
            f"calculate_score_jaccard()->({is_subset_nested}, {is_subset_other}, {is_subset_nested_reversed}), ({common_area_nested:.2f}, {xor_area_unnested:.2f})"
        )

    if is_subset_nested and is_subset_other:
        # Evidence for both scenarios
        # Test if the nested genotype is sufficiently large to assume the unnested genotype is a subset.
        # Test only the area where the unnested genotype was detected.
        # `other_polygon` is reused: `other_genotypes` has not changed since it was built.
        common_area_other = areascore.X_and_Y_polygon(unnested_polygon, other_polygon)
        score = int(common_area_nested > 2 * common_area_other)
    elif is_subset_nested:
        score = 1
    elif is_subset_nested_reversed and not is_subset_other:
        score = -1
    else:
        score = 0

    # Overrides: the area check on the second branch is only consulted when the first does not fire.
    if score == 0 and xor_area_unnested > common_area_nested * 2:
        score = -1
    elif unnested_area > 2 * nested_area:
        score = -1

    return score * self.weight_jaccard