예제 #1
0
 def __validate_input(ref, seq):
     """
     Check that `seq` lines up with `ref` and only uses accepted characters.

     Raises weCallException when the length of `seq` differs from
     `ref.length_with_deletions()`, or when `seq` contains a character that
     the sequence pattern below does not accept.
     """
     seq_length = len(seq)
     if seq_length != ref.length_with_deletions():
         raise weCallException(
             "Sequence has to be of the same length as reference.")

     legal_sequence = re.match(r'^[ACGTURYKMSWBDHVN\*\.]*\Z', seq)
     if legal_sequence is None:
         raise weCallException(
             "Illegal character in sequence {!r}".format(seq))
예제 #2
0
    def __validate_input(ref_char, seq_char, qual_char):
        """
        Check the three characters describing one sequence position.

        Raises weCallException when any of the three is not exactly one
        character long, or when the reference character equals MISSING_BASE.
        """
        for char in (ref_char, seq_char, qual_char):
            if len(char) != 1:
                raise weCallException(
                    "All characters at sequence position has to be of length 1.")

        reference_is_missing = ref_char == MISSING_BASE
        if reference_is_missing:
            raise weCallException("Missing reference character.")
예제 #3
0
    def __validate_character_combination(self):
        """
        Reject reference / sequence / quality characters that cannot occur
        together at a single position.

        Raises weCallException when:
          * the reference is DELETED_BASE while the sequence is MATCHING_BASE;
          * a quality other than MISSING_BASE is given for a sequence
            character equal to DELETED_BASE;
          * a quality other than MISSING_BASE is given where `is_gap` is set.
        """
        reference_deleted = self.ref_char == DELETED_BASE
        if reference_deleted and self.seq_char == MATCHING_BASE:
            raise weCallException(
                "Invalid character combination: ref char = {}, sequence char = {}"
                .format(self.ref_char, self.seq_char))

        # Both remaining checks only matter when a quality has been supplied.
        if self.qual_char != MISSING_BASE:
            if self.seq_char == DELETED_BASE:
                raise weCallException(
                    "Cannot assign base quality to a deleted base.")
            if self.is_gap:
                raise weCallException(
                    "Cannot assign base quality inside a gap.")
예제 #4
0
파일: sample_data.py 프로젝트: dylex/wecall
    def add_sample_data(self, sample_name, key_name, sample_data_value):
        """
        Store `sample_data_value` under `key_name` for sample `sample_name`.

        Raises weCallException when `key_name` or `sample_name` is not
        already known to this object, or when the value stored under
        GENOTYPE_KEY is not a GenotypeCall.
        """
        values_by_key = self.__key_to_sample_values
        if key_name not in values_by_key:
            raise weCallException(
                "Missing key {} when adding sample data.".format(key_name))

        known_samples = self.__sample_names
        if sample_name not in known_samples:
            raise weCallException(
                "Missing sample name {} supplied when adding sample data.".
                format(sample_name))

        is_genotype_field = key_name == GENOTYPE_KEY
        if is_genotype_field and not isinstance(sample_data_value,
                                                GenotypeCall):
            raise weCallException("Genotype field must be a GenotypeCall.")

        # Per-key values are stored positionally, in sample-name order.
        sample_index = known_samples.index(sample_name)
        values_by_key[key_name][sample_index] = sample_data_value
예제 #5
0
파일: interval.py 프로젝트: dylex/wecall
def read_interval(interval_string):
    """
    Parse a "start-end" string into an Interval.

    The string is split on "-" and both halves are converted with `int`.
    Raises weCallException unless start is strictly smaller than end.
    """
    start_string, end_string = interval_string.split("-")
    start = int(start_string)
    end = int(end_string)
    if start >= end:
        raise weCallException(
            "Interval {} does not have start < end".format(interval_string))
    return Interval(start, end)
예제 #6
0
파일: sample_data.py 프로젝트: dylex/wecall
 def get_variant_support(self, sample_name):
     """
     Return the variant support recorded for `sample_name`.

     The keys of VARIANT_SUPPORT_MAP are tried in order; the first one for
     which `has_genotype_key` is true is read with `get_field` and passed
     through the callable mapped to that key.

     Raises weCallException when none of the keys is present.
     """
     for key, convert in VARIANT_SUPPORT_MAP.items():
         if not self.has_genotype_key(key):
             continue
         raw_value = self.get_field(sample_name, key)
         return convert(raw_value)

     raise weCallException(
         "Expected one of {} as the variant support key.".format(
             list(VARIANT_SUPPORT_MAP.keys())))
예제 #7
0
    def __init__(self, quality_string, quality_mapping=QUALITY_MAPPING):
        """
        Build the quality object from its string representation.

        Raises weCallException when `SequenceQuality.is_valid_qual` rejects
        `quality_string`.
        """
        string_is_valid = SequenceQuality.is_valid_qual(quality_string)
        if not string_is_valid:
            raise weCallException(
                "Illegal character in the quality string {!r}".format(
                    quality_string))

        # The mapping is stored before parsing, as in the original ordering.
        self.quality_mapping = quality_mapping
        self.ascii_quality = self.parse_quality_to_ascii(quality_string)
예제 #8
0
 def add_sample_name(self, sample_name):
     """
     Register a new sample and return the SequenceBank created for it.

     The bank is built on `self.reference` and stored under `sample_name`.
     Raises weCallException when the sample name is already registered.
     """
     samples = self.__samples
     if sample_name in samples:
         raise weCallException(
             "Sample {} already exists in the SampleBank.".format(
                 sample_name))

     new_bank = SequenceBank(self.reference)
     samples[sample_name] = new_bank
     return new_bank
예제 #9
0
    def index(self):
        """
        Run `samtools faidx` on `self.filename`.

        The samtools executable is taken from the directory named by the
        WECALL_BIN environment variable.

        Returns:
            self, so that calls can be chained.

        Raises:
            weCallException: if the tool finishes with a non-zero return
                code. The message names the file and the return code.
        """
        tool_runner = ToolRunner()
        samtools = os.path.join(os.environ['WECALL_BIN'], "samtools")
        tool_runner.start([samtools, "faidx", self.filename])

        if tool_runner.return_code != 0:
            # An empty message made indexing failures impossible to diagnose.
            raise weCallException(
                "samtools faidx failed for {!r} with return code {}".format(
                    self.filename, tool_runner.return_code))
        return self
예제 #10
0
    def __get_expected_calls_from_haplotypes(ascii_strings, reference):
        """
        Derive the expected genotype call for each variant found in a pair
        of ascii haplotypes.

        Variants found on both haplotypes are called "1/1"; variants found
        on exactly one of them are called "0/1".

        Raises weCallException unless exactly two haplotypes are given and
        each has length `reference.length_with_deletions()`.
        """
        if len(ascii_strings) != 2:
            raise weCallException(
                "Expected calls have to be defined as a diploid.")
        for haplotype in ascii_strings:
            if len(haplotype) != reference.length_with_deletions():
                raise weCallException(
                    "Ascii haplotypes have to be of the same length as the reference")

        first_variants, second_variants = (
            Sequence(reference, haplotype).variants
            for haplotype in ascii_strings)

        on_both = first_variants.intersection(second_variants)
        on_one_only = first_variants.symmetric_difference(second_variants)

        calls = {var: GenotypeCall("1/1") for var in on_both}
        calls.update({var: GenotypeCall("0/1") for var in on_one_only})
        return calls
예제 #11
0
def trimmed_vcf_ref_alt(ref, alt):
    """
    Trim a ref/alt allele pair while keeping one base of context where the
    code below requires it.

    The alleles are first trimmed with `trimmed_ref_alt`. When the input
    alleles differ in length, or both trimmed alleles are empty, one base of
    context is kept: the base just before the trimmed region, or the base
    just after it when the trimmed region starts at offset 0.

    Returns a tuple (offset, ref, alt) where offset is the index in the
    input alleles at which the returned alleles start.

    Raises weCallException for empty alleles, for identical alleles longer
    than one base, and when either allele equals UNKNOWN.
    """
    if len(ref) == 0 or len(alt) == 0:
        raise weCallException("VCF format requires non-empty ref and alt")
    if len(ref) > 1 and ref == alt:
        raise weCallException("VCF requires refcalls of length 1")
    if UNKNOWN in (alt, ref):
        # VCF itself permits this to mark unknown data; it is rejected here.
        raise weCallException("not dealing with monomorphic variants")

    offset, new_ref, new_alt = trimmed_ref_alt(ref, alt)

    needs_context = len(ref) != len(alt) or not (new_ref or new_alt)
    leading_context = 1 if needs_context and offset != 0 else 0
    trailing_context = 1 if needs_context and offset == 0 else 0

    start = offset - leading_context
    ref_end = offset + len(new_ref)
    alt_end = offset + len(new_alt)

    result_ref = (
        ref[start:offset] + new_ref + ref[ref_end:ref_end + trailing_context])
    result_alt = (
        alt[start:offset] + new_alt + alt[alt_end:alt_end + trailing_context])
    return start, result_ref, result_alt
예제 #12
0
def read_records(schema, line):
    """
    Extracts a sequence of `Record` objects from a single line in a VCF file.

    The line is stripped, split on tab characters, and the resulting columns
    are handed to `generate_records`; every item it produces is yielded.

    Raises:
        weCallException: propagated unchanged when raised while generating
            records. Any other Exception is re-raised as a weCallException
            whose message names the offending line and the original error;
            the original exception is kept as the cause and its traceback
            is preserved.
    """
    try:
        cols = line.strip().split("\t")
        for item in generate_records(schema, cols):
            yield item
    except weCallException:
        raise
    except Exception as exc:
        # Python 3 exceptions have no `.message` attribute; format the
        # exception itself so the wrapper cannot fail with AttributeError
        # and hide the real problem.
        raise weCallException(
            "while reading record from line {!r}: {!s}".format(line, exc)
        ).with_traceback(exc.__traceback__) from exc
예제 #13
0
 def __potentially_merge_adjacent_variants(var_1, var_2):
     """
     Combine two variants of the same type into one where that is supported.

     Returns a pair. The inputs come back unchanged when either is None,
     when their types differ, or when the type is TYPE_SNP or TYPE_REF.
     For TYPE_DEL / TYPE_INS the pair is (None, merged), where merged is
     var_1 extended by the last ref base (deletion) or the last alt base
     (insertion) of var_2.

     Raises weCallException for any other variant type.
     """
     if var_1 is None or var_2 is None:
         return var_1, var_2

     variant_type = var_1.type
     if variant_type != var_2.type:
         return var_1, var_2
     if variant_type in (TYPE_SNP, TYPE_REF):
         return var_1, var_2

     if variant_type == TYPE_DEL:
         merged_ref = var_1.ref + var_2.ref[-1]
         merged_alt = var_1.alt
     elif variant_type == TYPE_INS:
         merged_ref = var_1.ref
         merged_alt = var_1.alt + var_2.alt[-1]
     else:
         raise weCallException("Unexpected variant type: " +
                               TYPE_TO_STR[variant_type])

     merged_variant = Variant(var_1.chrom, var_1.pos_from, merged_ref,
                              merged_alt)
     return None, merged_variant
예제 #14
0
파일: sample_data.py 프로젝트: dylex/wecall
    def get_genotype_likelihoods(self, sample_name):
        """
        Return the genotype likelihoods stored for `sample_name`, divided by
        the scaling factor of the key they were stored under.

        The keys of LIKELIHOOD_SCALING_FACTOR are tried in order and the
        first one for which `has_genotype_key` is true is used. A stored
        value of None or '.' is returned as is; inside a list, None and '.'
        entries become None.

        Raises weCallException when none of the keys is present.
        """
        def convert_likelihoods(likelihoods, factor):
            # A wholly missing field is passed through untouched.
            if likelihoods is None or likelihoods == '.':
                return likelihoods
            scaled = []
            for value in likelihoods:
                scaled.append(
                    None if value in {None, '.'} else value / factor)
            return scaled

        for key, factor in LIKELIHOOD_SCALING_FACTOR.items():
            if not self.has_genotype_key(key):
                continue
            stored_values = self.get_field(sample_name, key)
            return convert_likelihoods(stored_values, factor)

        raise weCallException(
            "Expected one of {} as the likelihood key.".format(
                list(LIKELIHOOD_SCALING_FACTOR.keys())))
예제 #15
0
    def _get_variants(self):
        """
        Walk the reference and this sequence character by character and
        collect the variants implied by their differences.

        Returns the collected set of variants after it has been passed
        through `__remove_deletions_from_edges`.

        Raises weCallException when a position has DELETED_BASE in the
        reference but MATCHING_BASE in the sequence.
        """
        variants = set()

        # ref_index is incremented before it is used for every reference
        # character that is not DELETED_BASE, hence the start one before
        # pos_from.
        ref_index = self.pos_from - 1
        # Variant carried over between iterations; handed to
        # __add_variant_to_set together with each newly built variant.
        current_variant = None

        for ref_char, alt_char in zip(self._reference.ref_seq, self._seq):
            # Only reference characters other than DELETED_BASE advance the
            # reference position.
            if ref_char != DELETED_BASE:
                ref_index += 1

            if ref_char == DELETED_BASE and alt_char == MATCHING_BASE:
                raise weCallException(
                    "Invalid sequence at ref position {}".format(ref_index))
            elif ref_char == DELETED_BASE and alt_char == DELETED_BASE:
                # Deleted in both reference and sequence: nothing to record.
                continue
            elif alt_char == MATCHING_BASE:
                # No new variant at this position; None is passed in its
                # place. NOTE(review): presumably this flushes the pending
                # variant into the set - confirm in __add_variant_to_set.
                current_variant = self.__add_variant_to_set(
                    current_variant, None, variants)
                continue

            if ref_char == DELETED_BASE:
                # Insertion: positioned at the current reference index; the
                # alt is the reference base there followed by the inserted
                # character.
                # NOTE(review): assumes self._reference can be indexed by
                # reference position - confirm.
                var_pos = ref_index
                var_ref = self._reference[var_pos]
                var_alt = var_ref + alt_char
            elif alt_char == DELETED_BASE:
                # Deletion: positioned one before the deleted base; the ref
                # is that preceding base plus the deleted one, the alt is
                # the preceding base alone.
                var_pos = ref_index - 1
                var_ref = self._reference[var_pos] + ref_char
                var_alt = self._reference[var_pos]
            else:
                # SNP: single-character ref and alt at the current position.
                var_pos = ref_index
                var_ref = ref_char
                var_alt = alt_char

            new_variant = Variant(self._reference.chrom, var_pos, var_ref,
                                  var_alt)
            current_variant = self.__add_variant_to_set(
                current_variant, new_variant, variants)

        # Final call with no new variant once all positions are processed.
        self.__add_variant_to_set(current_variant, None, variants)
        variants = self.__remove_deletions_from_edges(variants)
        return variants
예제 #16
0
파일: sample_data.py 프로젝트: dylex/wecall
    def set_genotype_likelihoods(self, sample_name, likelihood_values):
        """
        Store genotype likelihoods for `sample_name`, multiplied by the
        scaling factor of the key they are stored under.

        The keys of LIKELIHOOD_SCALING_FACTOR are tried in order and the
        first one for which `has_genotype_key` is true is used; the scaled
        values are stored with `add_sample_data`. A value of None or '.' is
        stored as is; inside a list, None and '.' entries become None.

        Raises weCallException when none of the keys is present.
        """
        def convert_likelihoods(likelihoods, factor):
            # A wholly missing field is passed through untouched.
            if likelihoods is None or likelihoods == '.':
                return likelihoods
            scaled = []
            for value in likelihoods:
                scaled.append(
                    None if value in {None, '.'} else value * factor)
            return scaled

        for key, factor in LIKELIHOOD_SCALING_FACTOR.items():
            if not self.has_genotype_key(key):
                continue
            converted_values = convert_likelihoods(likelihood_values, factor)
            self.add_sample_data(sample_name, key, converted_values)
            return

        raise weCallException(
            "Expected one of {} as the likelihood key.".format(
                list(LIKELIHOOD_SCALING_FACTOR.keys())))
예제 #17
0
    def __get_expected_calls_from_sample_ascii_haplotypes(
            ascii_haplotypes, reference):
        """
        Build a mapping variant -> {sample name -> genotype} from the ascii
        haplotypes of every sample.

        Each sample's haplotypes are converted with
        `__get_expected_calls_from_haplotypes`. Raises weCallException when
        a genotype already exists for the same sample and variant.
        """
        calls_per_variant = {}
        for sample_name, ascii_strings in ascii_haplotypes.items():
            sample_calls = AsciiWecallRunnerTest.__get_expected_calls_from_haplotypes(
                ascii_strings, reference)
            for variant, genotype in sample_calls.items():
                calls_for_variant = calls_per_variant.get(variant)
                if calls_for_variant is not None and sample_name in calls_for_variant:
                    raise weCallException(
                        "Cannot supply multiple genotypes for "
                        "sample_name {} and variant {}.".format(
                            sample_name, variant))
                if calls_for_variant is None:
                    # An OrderedDict is used only so the result has the same
                    # shape as the actual calls.
                    calls_for_variant = OrderedDict()
                    calls_per_variant[variant] = calls_for_variant

                calls_for_variant[sample_name] = genotype

        return calls_per_variant
예제 #18
0
    def build_annotated_seq(self, n_fwd, n_rev, mapping_quality, insert_size,
                            read_id, read_flags, cigar_string, read_start,
                            read_mate_start):
        """
        Turn this raw sequence into a one-element list holding a
        ReadSequenceWithCoverage.

        When `n_fwd` is not None the given `n_fwd` / `n_rev` are used as the
        coverage. Otherwise the coverage is (0, 1) if `is_reverse_seq()` is
        true, or (1, 0) if `is_forward_seq()` is true.

        Raises weCallException when no coverage is given and the sequence is
        neither reverse nor forward.
        """
        reference = ReferenceChromosome(self.reference_string, self.pos_from)
        # "," is rewritten to "." and the bases are upper-cased before the
        # Sequence is built.
        normalised_bases = self.sequence_string.replace(",", ".").upper()
        sequence = Sequence(reference, normalised_bases, cigar_string)
        quality = SequenceQuality(self.quality_string)

        read_sequence = ReadSequence(sequence, quality, mapping_quality,
                                     insert_size, read_id, read_flags,
                                     read_start, read_mate_start)

        if n_fwd is not None:
            coverage = (n_fwd, n_rev)
        elif self.is_reverse_seq():
            coverage = (0, 1)
        elif self.is_forward_seq():
            coverage = (1, 0)
        else:
            raise weCallException(
                "Raw sequence: {} is neither forward or reverse".format(self))

        return [ReadSequenceWithCoverage(read_sequence, *coverage)]
예제 #19
0
def _parse_flag(value):
    """
    Parse a 'flag' info field into True, False or None.

    A value equal to UNKNOWN gives None. A value of None means the flag was
    used as a proper flag and gives True. Booleans are returned unchanged.
    Any other value is upper-cased and looked up among the accepted
    spellings of true ('1', 'YES', 'TRUE') and false ('0', 'NO', 'FALSE').

    Raises weCallException, after logging a warning, for any other value.
    """
    if value == UNKNOWN:
        return None
    if value is None:
        return True
    if isinstance(value, bool):
        return value

    spellings = {
        '1': True, 'YES': True, 'TRUE': True,
        '0': False, 'NO': False, 'FALSE': False,
    }
    normalised = value.upper()
    if normalised in spellings:
        return spellings[normalised]

    # For strict VCF parsing configure parser to throw on log warnings.
    # TODO: Work out how to configure logger to do this.
    message = "Invalid flag {}".format(normalised)
    logging.warning(message)
    raise weCallException(message)
예제 #20
0
 def __validate_ref_seq(self, ref_seq):
     """
     Check that `ref_seq` only contains characters accepted by the
     reference pattern below.

     Raises weCallException otherwise.
     """
     legal_reference = re.match(r'^[ACGTURYKMSWBDHVN\*]*\Z', ref_seq)
     if legal_reference is None:
         raise weCallException(
             "Illegal character in reference sequence {!r}".format(ref_seq))
예제 #21
0
def sequence_builder(
    reference,
    seq_string,
    quality_string=None,
    n_fwd=None,
    n_rev=None,
    mapping_quality=HIGH_QUALITY,
    insert_size=None,
    read_id=None,
    read_flags=None,
    cigar_string=None,
    read_start=None,
    read_mate_start=None,
):
    """
    Split an ascii sequence string into its gap-separated reads and return
    the annotated sequences built from them.

    `seq_string` and `quality_string` are walked alongside
    `reference.ref_seq`. Consecutive non-gap positions are gathered into one
    RawStringSequences; a gap position closes the sequence in progress. When
    `quality_string` is None, a string of spaces of the same length as
    `seq_string` is used.

    If the resulting sequences alternate forward, reverse, forward, reverse,
    ... and there is an even number of them, they are annotated pairwise
    with `build_annotated_pair`; otherwise each is annotated on its own with
    `build_annotated_seq`.

    Raises weCallException when exactly one of `n_fwd` / `n_rev` is None, or
    when `seq_string` or the quality string does not have length
    `reference.length_with_deletions()`.
    """
    if quality_string is None:
        quality_string = " " * len(seq_string)

    # Coverage must be given for both strands or for neither.
    if (n_fwd is None) != (n_rev is None):
        raise weCallException(
            "Invalid combination of forward and reverse reads: n_fwd = {}, n_rev = {} "
            .format(n_fwd, n_rev))

    seq_length = len(seq_string)
    if seq_length != reference.length_with_deletions():
        raise weCallException(
            "Sequence has to be of the same length as reference. seq_length {}, ref_length {}"
            .format(seq_length, reference.length_with_deletions()))

    if len(quality_string) != reference.length_with_deletions():
        raise weCallException(
            "Quality string has to be of the same length as reference.")

    ref_pos = reference.pos_from
    pending = RawStringSequences()
    sequences = []
    columns = zip(reference.ref_seq, seq_string, quality_string)
    for ref_char, seq_char, qual_char in columns:
        position = SequencePosition(ref_char, seq_char, qual_char)

        if not position.is_gap:
            pending.add_position(position, ref_pos)
        elif pending.is_ongoing:
            # A gap closes the sequence currently being gathered.
            sequences.append(pending)
            pending = RawStringSequences()

        ref_pos = position.update_ref_pos(ref_pos)

    if pending.is_ongoing:
        sequences.append(pending)

    even_indexed = sequences[0::2]
    odd_indexed = sequences[1::2]
    is_paired = (
        len(sequences) % 2 == 0
        and all(seq.is_forward_seq() for seq in even_indexed)
        and all(seq.is_reverse_seq() for seq in odd_indexed))

    read_options = (n_fwd, n_rev, mapping_quality, insert_size, read_id,
                    read_flags, cigar_string, read_start, read_mate_start)

    annotated_seqs = []
    if is_paired:
        # sequence of read pairs
        for fwd, rev in zip(even_indexed, odd_indexed):
            annotated_seqs.extend(build_annotated_pair(fwd, rev,
                                                       *read_options))
    else:
        # unpaired reads
        for seq in sequences:
            annotated_seqs.extend(seq.build_annotated_seq(*read_options))
    return annotated_seqs
예제 #22
0
파일: sample_data.py 프로젝트: dylex/wecall
 def get_read_depth(self, sample_name):
     """
     Return the read depth stored for `sample_name`.

     The value is read with `get_field` from the first key in
     READ_DEPTH_KEYS for which `has_genotype_key` is true.

     Raises weCallException when none of the keys is present.
     """
     depth_key = next(
         (key for key in READ_DEPTH_KEYS if self.has_genotype_key(key)),
         None)
     if depth_key is None:
         raise weCallException(
             "Expected one of {} as the depth key.".format(READ_DEPTH_KEYS))
     return self.get_field(sample_name, depth_key)
예제 #23
0
    def sequence_string(self):
        """
        Return the stored sequence string.

        Raises weCallException when the sequence is neither a forward nor a
        reverse sequence.
        """
        has_direction = self.is_forward_seq() or self.is_reverse_seq()
        if has_direction:
            return self.__sequence_string

        raise weCallException("Illegal character in sequence {!r}".format(
            self.__sequence_string))
예제 #24
0
def get_chromosome_index(chrom):
    """
    Return the entry of CHROMOSOME_ORDER for `chrom`.

    The name is passed through `standardise_chromosome` before the lookup.
    Raises weCallException when a KeyError occurs during that process.
    """
    try:
        standard_name = standardise_chromosome(chrom)
        index = CHROMOSOME_ORDER[standard_name]
    except KeyError:
        raise weCallException("Invalid chromosome {}".format(chrom))
    return index
예제 #25
0
def from_vcf_str(vcf_str, desired_type):
    """
    Convert a VCF field string with `desired_type`.

    Returns None for the string ".", otherwise `desired_type(vcf_str)`.
    Raises weCallException when the conversion raises ValueError.
    """
    try:
        if vcf_str != ".":
            return desired_type(vcf_str)
        return None
    except ValueError:
        raise weCallException("Cannot cast {} to {!r}".format(
            vcf_str, desired_type))
예제 #26
0
 def __validate_expected_calls(expected_ascii, expected_stubs):
     """
     Check that expected variants were given in at least one format.

     Raises weCallException when both `expected_ascii` and
     `expected_stubs` are None.
     """
     nothing_provided = all(
         calls is None for calls in (expected_ascii, expected_stubs))
     if nothing_provided:
         raise weCallException(
             "Expected variants have to be provided either in the ascii or variant stub format."
         )