def label_peaks_parsimonious(peaks, known_emission_lines, delta):
    '''
    Explain the matched peaks with as few elements as possible.

    Out of all the peaks that are within +/-delta of at least one known
    emission line, find a minimal set of elements (via unweighted greedy
    set cover) that "explains" all such peaks.

    Parameters
    ----------
    peaks
        Iterable of peak positions. A sorted copy is made; the caller's
        object is not modified.
    known_emission_lines
        Table with an "elt" column (element label) and a "wav_mars" column
        (line position). It must support ``sort_values`` and ``iterrows``
        (presumably a pandas DataFrame -- confirm against the caller).
    delta
        Matching tolerance, forwarded unchanged to ``nearby_peaks``.

    Returns
    -------
    tuple
        ``(cover_elts_to_peaks, cover_peaks_to_elts, unlabeled)`` where

        * ``cover_elts_to_peaks`` is an OrderedDict mapping each chosen
          element to the list of peaks it matches, keyed in the order the
          cover was returned;
        * ``cover_peaks_to_elts`` is an OrderedDict mapping each explained
          peak to the list of chosen elements that match it;
        * ``unlabeled`` is the sorted list of distinct peak values that no
          chosen element matches.
    '''
    known_emission_lines = known_emission_lines.sort_values(by="wav_mars")
    peaks = sorted(peaks)

    # For every element, collect the indices (into the sorted peaks) of all
    # peaks matched by any of that element's lines.
    elts_to_peak_indices = defaultdict(set)
    for _, line in known_emission_lines.iterrows():
        # NOTE(review): nearby_peaks is assumed to return a half-open index
        # range [i_lo, i_hi) into `peaks` -- confirm against its definition.
        i_lo, i_hi = nearby_peaks(peaks, line["wav_mars"], delta=delta)
        # `range` rather than the Python-2-only `xrange`, so this also runs
        # on Python 3; the indices produced are the same.
        elts_to_peak_indices[line["elt"]].update(range(i_lo, i_hi))

    # One candidate set per element, in sorted element order, so that the
    # indices returned by the cover routine can be mapped back through `elts`.
    elts = sorted(np.unique(known_emission_lines["elt"]))
    sets = [list(elts_to_peak_indices[elt]) for elt in elts]
    best_cover = set_cover_approx_fast(sets)

    cover_elts_to_peaks = OrderedDict()
    cover_peaks_to_elts = OrderedDict()
    unlabeled = set(peaks)
    for i in best_cover:
        elt = elts[i]
        for peak_index in elts_to_peak_indices[elt]:
            peak = peaks[peak_index]
            cover_elts_to_peaks.setdefault(elt, []).append(peak)
            cover_peaks_to_elts.setdefault(peak, []).append(elt)
            # A peak can be matched by several chosen elements, so it may
            # already have been removed; discard tolerates that.
            unlabeled.discard(peak)
    return cover_elts_to_peaks, cover_peaks_to_elts, sorted(unlabeled)
def test_set_cover_approx_fast():
    '''
    Check set_cover_approx_fast against two fixed inputs.

    The first case uses the module-level ``example_sets`` fixture. The second
    is a small inline case in which the expected cover is the sets at
    indices 1 and 2.
    '''
    cover_of_examples = set_cover_approx_fast(example_sets)
    assert cover_of_examples == [4, 3, 2]

    small_sets = [[30], [20, 30], [10]]
    cover_of_small_sets = set_cover_approx_fast(small_sets)
    assert cover_of_small_sets == [1, 2]