def validate_reference_contig_coverage(ref_contigs, shared_contigs,
                                       min_coverage_fraction):
    """Validates that shared_contigs spans a sufficient amount of ref_contigs.

    Args:
      ref_contigs: List of ContigInfo protos. All of the contigs from our
        reference genome.
      shared_contigs: The subset of ref_contigs that we found in common with
        all other genomics data sources.
      min_coverage_fraction: The minimum fraction of basepairs of ref_contigs
        that should be found among the shared_contigs.

    Raises:
      ValueError: If shared_contigs is empty, or if the fraction of covered
        bases is less than min_coverage_fraction. A reference whose contigs
        total zero bases is treated as having zero coverage.
    """

    def format_contig_matches():
        # Only called while building the error message, so the per-contig
        # report costs nothing on the success path.
        common_map = ranges.contigs_dict(shared_contigs)
        return ', '.join(
            '"{}" is {} bp and {}'.format(
                ref_contig.name, ref_contig.n_bases,
                'matched' if ref_contig.name in common_map else 'IS MISSING')
            for ref_contig in ref_contigs)

    ref_bp = ranges.contigs_n_bases(ref_contigs)
    common_bp = ranges.contigs_n_bases(shared_contigs)
    # Guard the division: with no reference bases the fraction is undefined,
    # and callers expect a ValueError here rather than a ZeroDivisionError.
    # float() keeps this a true division even under Python 2 integer rules.
    coverage = common_bp / float(ref_bp) if ref_bp else 0.0
    if not shared_contigs or coverage < min_coverage_fraction:
        raise ValueError('Reference contigs span {} bases but only {} bases '
                         '({:.2%}) were found in common among our input files. '
                         'Check that the sources were created on a common genome '
                         'reference build. Contig matches were: {}'.format(
                             ref_bp, common_bp, coverage,
                             format_contig_matches()))
def test_contigs_n_bases(self):
    """Checks ranges.contigs_n_bases against expected totals for contig lists."""
    contig_c = reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0)
    contig_a = reference_pb2.ContigInfo(name='a', n_bases=50, pos_in_fasta=1)
    contig_b = reference_pb2.ContigInfo(name='b', n_bases=25, pos_in_fasta=2)

    # (expected total, contigs) pairs: singletons first, then combinations.
    cases = [
        (100, [contig_c]),
        (50, [contig_a]),
        (25, [contig_b]),
        (150, [contig_c, contig_a]),
        (125, [contig_c, contig_b]),
        (175, [contig_c, contig_a, contig_b]),
    ]
    for expected_bases, contigs in cases:
        self.assertEqual(expected_bases, ranges.contigs_n_bases(contigs))