def __init__(self, locus, index, start, stop, seq):
    """Build new HomAltAllele.

    ``locus``, ``index``, ``start`` and ``stop`` are stored unchanged;
    ``seq`` is stored after being passed through ``normalize_seq``.
    """
    self.locus, self.index = locus, index
    self.start, self.stop = start, stop
    # The sequence is the only argument that is transformed before storage.
    self.seq = normalize_seq(seq)
def __init__(self, locus, index, start, stop, seq, phase=None):
    """Build new HetAltAllele.

    ``locus``, ``index``, ``start``, ``stop`` and ``phase`` are stored
    unchanged; ``seq`` is stored after being passed through
    ``normalize_seq``.  ``phase`` defaults to ``None``.
    """
    self.locus, self.index = locus, index
    self.start, self.stop = start, stop
    # The sequence is the only argument that is transformed before storage.
    self.seq = normalize_seq(seq)
    self.phase = phase
def build_match_strings(ref, start, stop, allele, mode='sensitive', debug=False):
    """Build the normalized reference string and one match string per alt.

    Args:
        ref: Reference sequence; sliced by coordinate.
        start: Start coordinate of the region to build strings for.
        stop: Stop coordinate of the region to build strings for.
        allele: Object providing ``alts``, ``start``, ``stop`` and ``ref``,
            plus ``min_start`` and ``max_stop`` when ``mode`` is
            ``'sensitive'``.
        mode: ``'specific'`` flanks every alt with reference sequence over
            the whole ``start``..``stop`` region.  ``'sensitive'`` keeps
            reference sequence only between ``min_start``..``allele.start``
            and ``allele.stop``..``max_stop`` and fills the remainder of the
            region with ``'*'``.
        debug: When true, write diagnostics to ``sys.stderr``.

    Returns:
        A ``(super_ref, super_alleles)`` tuple: the normalized reference
        slice and the list of normalized match strings, one per alt.

    Raises:
        ValueError: If ``mode`` is neither ``'specific'`` nor ``'sensitive'``.
    """
    alts = allele.alts

    if debug:
        summary = ' Allele: start={}, stop={}, size={}, ref={}, seq={}'.format(
            allele.start,
            allele.stop,
            allele.stop - allele.start,
            allele.ref,
            ','.join(alts),
        )
        print(summary, file=sys.stderr)

    super_ref = normalize_seq(ref[start:stop])

    # Require reference matches within the wobble zone + padding built into
    # each normalized allele.
    if mode == 'specific':
        super_alleles = []
        for alt in alts:
            flanked = ref[start:allele.start] + alt + ref[allele.stop:stop]
            super_alleles.append(normalize_seq(flanked))
    elif mode == 'sensitive':
        super_alleles = []
        for alt in alts:
            # Pieces are built left to right, inside the loop, so nothing is
            # evaluated when there are no alts.
            left_mask = '*' * (allele.min_start - start)
            left_ref = ref[allele.min_start:allele.start]
            right_ref = ref[allele.stop:allele.max_stop]
            right_mask = '*' * (stop - allele.max_stop)
            masked = left_mask + left_ref + alt + right_ref + right_mask
            super_alleles.append(normalize_seq(masked))
    else:
        raise ValueError(f'invalid match mode specified: {mode}')

    if debug:
        print(' MODE:', mode, file=sys.stderr)
        print(' SUPER ALLELES:', super_alleles, file=sys.stderr)
        print(' SUPER REF: ', super_ref, file=sys.stderr)

    # Sanity check: each match string is the region length with the allele's
    # reference span swapped for the alt.
    assert all(
        len(built) == stop - start - len(allele.ref) + len(alt)
        for built, alt in zip(super_alleles, alts)
    )

    return super_ref, super_alleles
def __init__(self, start, stop, seq):
    """Store the start/stop coordinates and the normalized sequence.

    ``start`` and ``stop`` are stored unchanged; ``seq`` is stored after
    being passed through ``normalize_seq``.
    """
    self.start, self.stop = start, stop
    self.seq = normalize_seq(seq)
def seq(self):
    """Return the normalized slice of ``self.ref`` from ``start`` to ``stop``."""
    reference = self.ref
    segment = reference[self.start:self.stop]
    return normalize_seq(segment)
def __init__(self, start, stop, seq, phase=None):
    """Store the start/stop coordinates, normalized sequence and phase.

    ``start``, ``stop`` and ``phase`` are stored unchanged; ``seq`` is
    stored after being passed through ``normalize_seq``.  ``phase``
    defaults to ``None``.
    """
    self.start, self.stop = start, stop
    self.seq = normalize_seq(seq)
    self.phase = phase
def find_allele(ref, allele, superlocus, debug=False):
    """Determine the zygosity of ``allele`` within ``superlocus``.

    Args:
        ref: Reference sequence; sliced by coordinate.
        allele: Object providing ``start``, ``stop``, ``min_start``,
            ``max_stop`` and ``alleles`` (index 0 is used as the reference
            allele and index 1 as the alternate allele).
        superlocus: Sequence of locus objects to search.
        debug: When true, write diagnostics to ``sys.stderr``.

    Returns:
        On the fast path, the number of times the matching allele index
        occurs in the single locus's ``allele_indices``.  Otherwise the
        largest summed match value over all generated genotypes, or ``None``
        when that value is falsy and at least one match result is ``None``.
        Also ``None`` when ``OverlapError`` is raised while building the
        graph or its paths.
    """
    # FASTPATH: skip the graph match entirely when the superlocus is a single
    # locus with the same span that carries the alternate allele and is
    # marked 'PASS'.
    if len(superlocus) == 1:
        only = superlocus[0]
        same_span = allele.start == only.start and allele.stop == only.stop
        if (same_span
                and allele.alleles[1] in only.alleles[1:]
                and 'PASS' in only.record.filter):
            allele_index = only.alleles.index(allele.alleles[1])
            return only.allele_indices.count(allele_index)

    # Bounds come from normalized extremes
    start, stop = get_superlocus_bounds([[allele], superlocus])

    if debug:
        print(' Allele: start={}, stop={}, size={}, seq={}'.format(allele.start, allele.stop, allele.stop-allele.start, allele.alleles[1]), file=sys.stderr)

    # Require reference matches within the wobble zone + padding built into
    # each normalized allele.  Pieces are evaluated left to right.
    left_mask = '*' * (allele.min_start - start)
    left_ref = ref[allele.min_start:allele.start]
    alt_seq = allele.alleles[1]
    right_ref = ref[allele.stop:allele.max_stop]
    right_mask = '*' * (stop - allele.max_stop)
    super_allele = normalize_seq(left_mask + left_ref + alt_seq + right_ref + right_mask)

    expected_len = stop - start - len(allele.alleles[0]) + len(allele.alleles[1])
    assert len(super_allele) == expected_len

    # Create genotype sets for each superlocus
    try:
        graph, constraints = generate_graph(ref, start, stop, superlocus, debug)
        graph = list(graph)

        if debug:
            # Distinct loop names keep the region bounds above intact.
            for n, (node_start, node_stop, node_alleles) in enumerate(graph):
                print(' GRAPH{:02d}: start={}, stop={}, alleles={}'.format(n, node_start, node_stop, node_alleles), file=sys.stderr)
            print(file=sys.stderr)

        paths = generate_paths(graph, debug=debug)

        if debug:
            # Materialize so the paths can be printed and still be consumed below.
            paths = list(paths)
            for n, path in enumerate(paths):
                print(' PATH{:02d}: {}'.format(n, path), file=sys.stderr)
            print(file=sys.stderr)
    except OverlapError:
        return None

    # Generate the set of diploid genotypes (actually haplotypes)
    genos = set(generate_genotypes(paths, constraints, debug))

    # Apply matcher to each pair of alleles
    matches = []
    for hap1, hap2 in genos:
        matches.append((fancy_match(super_allele, hap1), fancy_match(super_allele, hap2)))

    # Find the highest zygosity; a falsy match result counts as zero.
    totals = [(m1 or 0) + (m2 or 0) for m1, m2 in matches]
    z = max(totals)

    # If no match, check for the presence of any nocalls
    if not z:
        if any(None in pair for pair in matches):
            z = None

    if debug:
        print(' ALLELE:{} {}'.format(len(super_allele), super_allele), file=sys.stderr)
        for n, (geno, match) in enumerate(zip(genos, matches)):
            print(' GENO{:02d}:{} {}'.format(n, tuple(map(len, geno)), geno), file=sys.stderr)
            print(' MATCH{:02d}: {}'.format(n, match), file=sys.stderr)
        print(file=sys.stderr)
        print(' ZYGOSITY: {}'.format(z), file=sys.stderr)

    return z
def seq(self):
    """Return the sequence of this allele.

    The sequence is the slice of ``self.ref`` from ``self.start`` to
    ``self.stop``, passed through ``normalize_seq``.
    """
    reference = self.ref
    segment = reference[self.start:self.stop]
    return normalize_seq(segment)
def test_normalize_seq():
    """Test sequence normalization."""
    # (input, expected output) pairs, checked in order.
    cases = (
        ('', ''),
        ('ACGT', 'ACGT'),
        ('ACGTNacgtNRYSWKMBDHV', 'ACGTNACGTNNNNNNNNNNN'),
    )
    for raw, expected in cases:
        assert normalize_seq(raw) == expected
def test_normalize_seq():
    """Check normalize_seq on empty, plain and mixed-case/ambiguous input."""
    # (input, expected output) pairs, checked in order.
    cases = (
        ('', ''),
        ('ACGT', 'ACGT'),
        ('ACGTNacgtNRYSWKMBDHV', 'ACGTNACGTNNNNNNNNNNN'),
    )
    for raw, expected in cases:
        assert normalize_seq(raw) == expected