def testChromosomeConversionHG19(self):
    """Check chromosome conversion for the hg19 build.

    "23" must become "X" and "24" must become "Y"; "2" and "4" must come
    back unchanged.
    """
    # (input chromosome, expected conversion, prefix of the failure message)
    cases = (
        ("23", "X", "chrom of 23 did not produce X: "),
        ("24", "Y", "chrom of 24 did not produce Y: "),
        ("2", "2", "chrom of 2 yielded different value: "),
        ("4", "4", "chrom of 4 yielded different value: "),
    )
    for chrom_in, expected, failure_prefix in cases:
        converted = MutUtils.convertChromosomeStringToMutationDataFormat(chrom_in, build="hg19")
        # The failure message carries the value that was actually produced.
        self.assertEqual(converted, expected, failure_prefix + converted)
def _determine_matching_alt_indices(self, mut, record, build):
    """Find which alternate alleles of ``record`` correspond to ``mut``.

    :param mut: mutation being matched; ``chr``, ``start``, ``end``,
        ``ref_allele`` and ``alt_allele`` are read
    :param record: variant record; ``is_monomorphic``, ``CHROM``, ``POS``,
        ``REF`` and ``ALT`` are read
    :param build: genome build forwarded to
        ``MutUtils.initializeMutFromAttributes``
    :return: list of matching positions in ``record.ALT``; ``[-1]`` when a
        monomorphic record matches; an empty list when nothing matches
    """
    to_mut_chrom = MutUtils.convertChromosomeStringToMutationDataFormat

    if record.is_monomorphic:
        # No alternates to walk: a hit is reported with the sentinel index -1.
        contig = to_mut_chrom(record.CHROM)
        if self.match_mode == "exact":
            hit = mut.chr == contig and mut.ref_allele == record.REF
        else:
            hit = (mut.chr == contig
                   and int(mut.start) <= record.POS
                   and int(mut.end) >= record.POS)
        return [-1] if hit else []

    matched = []
    for alt_pos, alt_allele in enumerate(record.ALT):
        # Build a throw-away mutation for this alternate so that coordinates
        # and alleles are compared in the same representation as ``mut``.
        candidate = MutUtils.initializeMutFromAttributes(
            to_mut_chrom(record.CHROM), record.POS, record.POS,
            str(record.REF), str(alt_allele), build)

        if self.match_mode == "exact":
            hit = (mut.chr == candidate.chr
                   and mut.ref_allele == candidate.ref_allele
                   and mut.alt_allele == candidate.alt_allele
                   and int(mut.start) == int(candidate.start)
                   and int(mut.end) == int(candidate.end))
        elif mut.chr == candidate.chr:
            # Non-exact mode: accept any of the four positional relations below.
            m_start, c_start = int(mut.start), int(candidate.start)
            m_end, c_end = int(mut.end), int(candidate.end)
            hit = (
                # identical interval
                (m_start == c_start and m_end == c_end)
                # mut starts inside the candidate and reaches its end or beyond
                or (c_start <= m_start <= c_end and m_end >= c_end)
                # mut spans the whole candidate
                or (m_start <= c_start and m_end >= c_end)
                # mut starts at or before the candidate and ends inside it
                or (m_start <= c_start and c_start <= m_end <= c_end)
            )
        else:
            hit = False

        if hit:
            matched.append(alt_pos)
    return matched
def _determine_matching_alt_indices(self, mut, record, build):
    """Return the indices of the alternates in ``record`` that match ``mut``.

    :param mut: mutation to match (reads ``chr``, ``start``, ``end``,
        ``ref_allele``, ``alt_allele``)
    :param record: variant record (reads ``is_monomorphic``, ``CHROM``,
        ``POS``, ``REF``, ``ALT``)
    :param build: genome build passed through to
        ``MutUtils.initializeMutFromAttributes``
    :return: list of indices into ``record.ALT``; ``[-1]`` for a matching
        monomorphic record; ``[]`` when there is no match
    """
    convert_chrom = MutUtils.convertChromosomeStringToMutationDataFormat

    if record.is_monomorphic:
        # A monomorphic record has no alternate to index, so -1 marks the hit.
        record_chrom = convert_chrom(record.CHROM)
        if self.match_mode == "exact":
            is_match = mut.chr == record_chrom and mut.ref_allele == record.REF
        else:
            is_match = (mut.chr == record_chrom
                        and int(mut.start) <= record.POS
                        and int(mut.end) >= record.POS)
        return [-1] if is_match else []

    alt_indices = []
    for alt_index, alt in enumerate(record.ALT):
        # Represent the alternate as a mutation so both sides are compared
        # on the same attributes.
        ds_mut = MutUtils.initializeMutFromAttributes(
            convert_chrom(record.CHROM), record.POS, record.POS,
            str(record.REF), str(alt), build)

        if self.match_mode == "exact":
            is_match = (mut.chr == ds_mut.chr
                        and mut.ref_allele == ds_mut.ref_allele
                        and mut.alt_allele == ds_mut.alt_allele
                        and int(mut.start) == int(ds_mut.start)
                        and int(mut.end) == int(ds_mut.end))
        elif mut.chr == ds_mut.chr:
            mut_start, ds_start = int(mut.start), int(ds_mut.start)
            mut_end, ds_end = int(mut.end), int(ds_mut.end)
            same_interval = mut_start == ds_start and mut_end == ds_end
            starts_inside = ds_start <= mut_start <= ds_end and mut_end >= ds_end
            spans_record = mut_start <= ds_start and mut_end >= ds_end
            ends_inside = mut_start <= ds_start and ds_start <= mut_end <= ds_end
            is_match = same_interval or starts_inside or spans_record or ends_inside
        else:
            is_match = False

        if is_match:
            alt_indices.append(alt_index)
    return alt_indices
def testChromosomeConversionHG19(self):
    """Test that an hg19 build converts chrom 23 to X and 24 to Y, and
    leaves chrom 2 and chrom 4 untouched."""
    convert = MutUtils.convertChromosomeStringToMutationDataFormat
    # Each entry: input chromosome, expected output, failure-message prefix.
    expectations = [
        ("23", "X", "chrom of 23 did not produce X: "),
        ("24", "Y", "chrom of 24 did not produce Y: "),
        ("2", "2", "chrom of 2 yielded different value: "),
        ("4", "4", "chrom of 4 yielded different value: "),
    ]
    for chrom, wanted, prefix in expectations:
        actual = convert(chrom, build="hg19")
        # Append the observed value so a failure shows what was produced.
        self.assertEqual(actual, wanted, prefix + actual)
def createMutations(self):
    """Yield one mutation for every row of the maflite input.

    Takes no arguments. For each row from ``self._tsvReader`` a mutation is
    created through ``self._mutation_data_factory`` and every column listed
    in ``self._specified_fields`` is stored on it as an ``'INPUT'``
    annotation. Columns whose name is a key of
    ``self._reverseAlternativeDict`` are stored under the mapped name.
    Values destined for the ``chr`` field are passed through
    ``MutUtils.convertChromosomeStringToMutationDataFormat`` first.

    :return: generator of mutations
    """
    alias_to_field = self._reverseAlternativeDict
    columns = self._specified_fields

    for line in self._tsvReader:
        mut = self._mutation_data_factory.create(build=self._build)

        for col in columns:
            # A column either is an alias for a mutation data field (store
            # it under the real field name) or is used under its own name.
            if col in alias_to_field:
                field = alias_to_field[col]
                self.logger.debug(field + " found from " + col)
            else:
                field = col

            val = line[col]
            if field == "chr":
                # Chromosome values are always normalised before storing.
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(field, val, 'INPUT')

        # Remove any surrounding whitespace from the alleles.
        mut.ref_allele = mut.ref_allele.strip()
        mut.alt_allele = mut.alt_allele.strip()

        # If alt == ref, check whether another field of the row carries a
        # different alternate allele.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary. Then these lines can be removed.
        # Compare by value: identity checks against "" depend on string
        # interning and are not a reliable emptiness test.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def retrieveExons(self, gene, padding=10, isCodingOnly=False):
    """Return the padded exon intervals recorded for ``gene``.

    :param gene: gene name; looked up in ``self.gene_id_idx`` and compared
        with the ``"gene"`` entry of each transcript
    :param padding: amount subtracted from every interval start and added
        to every interval end
    :param isCodingOnly: when True, transcripts for which
        ``gaf_annotation.is_non_coding_transcript`` is true are skipped and
        the intervals come from ``self.getCodingTranscriptCoords`` instead
        of the transcript's ``"genomic_coords"``
    :return: set of ``(gene, chr, start, end)`` tuples, with ``start`` and
        ``end`` rendered as strings; empty set when ``gene`` is not in
        ``self.gene_id_idx``
    """
    exons = set()
    gene_tuple = self.gene_id_idx.get(gene, None)
    if gene_tuple is None:
        return exons

    contig = MutUtils.convertChromosomeStringToMutationDataFormat(gene_tuple[0])
    contig_bins = self.Transcripts.get(contig, {})
    for bin_key in contig_bins:
        for transcript in contig_bins[bin_key]:
            if transcript["gene"] != gene:
                continue
            if isCodingOnly and gaf_annotation.is_non_coding_transcript(transcript, self):
                continue

            if isCodingOnly:
                genomic_coords = self.getCodingTranscriptCoords(transcript)
            else:
                genomic_coords = transcript["genomic_coords"]

            for coord in genomic_coords:
                # Coordinate pairs may be stored in either order.
                start = min(coord[0], coord[1])
                end = max(coord[0], coord[1])
                exons.add((gene, transcript["chr"], str(start - padding), str(end + padding)))
    return exons
def retrieveExons(self, gene, padding=10, isCodingOnly=False):
    """Collect the padded exon intervals of every transcript of ``gene``.

    :param gene: gene name, used as key into ``self.gene_id_idx`` and
        matched against each transcript's ``'gene'`` entry
    :param padding: value subtracted from each start and added to each end
    :param isCodingOnly: if True, skip transcripts that
        ``gaf_annotation.is_non_coding_transcript`` reports as non-coding
        and take intervals from ``self.getCodingTranscriptCoords`` rather
        than from ``'genomic_coords'``
    :return: set of ``(gene, chr, start, end)`` tuples where ``start`` and
        ``end`` are strings; an empty set if the gene is unknown
    """
    result = set()
    gene_entry = self.gene_id_idx.get(gene, None)
    if gene_entry is None:
        return result

    contig = MutUtils.convertChromosomeStringToMutationDataFormat(gene_entry[0])
    bins_for_contig = self.Transcripts.get(contig, {})
    for bin_id in bins_for_contig:
        for tx in bins_for_contig[bin_id]:
            if tx['gene'] != gene:
                continue
            if isCodingOnly and gaf_annotation.is_non_coding_transcript(tx, self):
                continue

            coords = self.getCodingTranscriptCoords(tx) if isCodingOnly else tx['genomic_coords']
            for coord in coords:
                # Normalise the pair so the smaller value is the start.
                low = min(coord[0], coord[1])
                high = max(coord[0], coord[1])
                result.add((gene, tx['chr'], str(low - padding), str(high + padding)))
    return result
def _createMutation(self, record, alt_index, build):
    """Build a mutation from one alternate allele of a variant record.

    :param record: variant record; ``CHROM``, ``POS``, ``REF``, ``ALT``,
        ``ID``, ``QUAL`` and ``is_monomorphic`` are read
    :param alt_index: index into ``record.ALT`` of the alternate to use
        (ignored for the allele itself when the record is monomorphic)
    :param build: genome build forwarded to
        ``MutUtils.initializeMutFromAttributes``
    :return: the mutation, annotated with id, qual, alt_allele_seen, filter
        data and info data
    """
    contig = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
    position = int(record.POS)

    # A reference of "." is treated as an empty allele.
    ref_allele = record.REF.strip()
    if ref_allele == ".":
        ref_allele = ""

    # Monomorphic records have no alternate, so the reference doubles as alt.
    if record.is_monomorphic:
        alt_allele = ref_allele
    else:
        alt_allele = str(record.ALT[alt_index]).strip()

    mut = MutUtils.initializeMutFromAttributes(
        contig, position, position, ref_allele, alt_allele, build,
        self._mutation_data_factory)

    record_id = record.ID if record.ID is not None else ""
    mut.createAnnotation("id", record_id, "INPUT", tags=[TagConstants.ID])
    mut.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
    mut.createAnnotation("alt_allele_seen", str(True), "INPUT")

    # Filter data is written either as a single collapsed field or per filter.
    if self.collapse_filter_fields:
        add_filter_data = self._add_filter_data_2_mutation_single_field
    else:
        add_filter_data = self._addFilterData2Mutation
    mut = add_filter_data(mut, record)

    return self._addInfoData2Mutation(mut, record, alt_index)
def createMutations(self):
    """Generate mutations from the specified maflite file, one per row.

    No inputs. Every column named in ``self._specified_fields`` becomes an
    ``'INPUT'`` annotation on the mutation; a column that is a key of
    ``self._reverseAlternativeDict`` is stored under the name it maps to.
    Whatever ends up in the ``chr`` field is first converted with
    ``MutUtils.convertChromosomeStringToMutationDataFormat``.

    :return: generator of mutations
    """
    reverse_aliases = self._reverseAlternativeDict
    all_columns = self._specified_fields

    for line in self._tsvReader:
        # We only need to assign fields that are mutation attributes and
        # have a different name in the maflite file.
        mut = self._mutation_data_factory.create(build=self._build)

        for col in all_columns:
            is_alias = col in reverse_aliases
            target = reverse_aliases[col] if is_alias else col
            if is_alias:
                self.logger.debug(target + " found from " + col)

            val = line[col]
            # Make sure to convert chromosome values, aliased or not.
            if target == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(target, val, 'INPUT')

        # Remove any surrounding whitespace if present.
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # If the alt allele equals the ref allele, check that this is not a
        # case where another field holds a different alternate allele.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary. Then these lines can be removed.
        # Value comparison is used here: ``is ""`` tests object identity,
        # which only works by accident of string interning.
        start_missing = mut.start == ""
        end_missing = mut.end == ""
        if end_missing and not start_missing:
            mut.end = mut.start
        elif start_missing and not end_missing:
            mut.start = mut.end

        yield mut
def _createMutation(self, record, alt_index, build):
    """Create a mutation for one alternate allele of a variant record.

    :param record: variant record; ``CHROM``, ``POS``, ``REF``, ``ALT``,
        ``ID``, ``QUAL`` and ``is_monomorphic`` are read
    :param alt_index: index of the alternate in ``record.ALT``
    :param build: genome build forwarded to
        ``MutUtils.initializeMutFromAttributes``
    :return: the mutation with id, qual, alt_allele_seen, filter and info
        annotations applied
    """
    contig = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
    pos = int(record.POS)

    # "." as reference is stored as an empty allele.
    ref_allele = record.REF
    if ref_allele == ".":
        ref_allele = ""

    # Without an alternate (monomorphic record) the reference is reused.
    alt_allele = ref_allele if record.is_monomorphic else str(record.ALT[alt_index])

    mut = MutUtils.initializeMutFromAttributes(contig, pos, pos, ref_allele, alt_allele, build)

    record_id = record.ID if record.ID is not None else ""
    mut.createAnnotation("id", record_id, "INPUT", tags=[TagConstants.ID])
    mut.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
    mut.createAnnotation("alt_allele_seen", str(True), "INPUT")

    mut = self._addFilterData2Mutation(mut, record)
    return self._addInfoData2Mutation(mut, record, alt_index)
def _convertGFFRecordToTranscript(self, gff_record, seq_dict, seq_dict_keys, tx_to_protein_mapping):
    """Fold one GFF record into ``self._transcript_index``.

    Records whose type is not exon, CDS, start_codon or stop_codon are
    ignored. The first record seen for a transcript id creates the
    transcript entry; every accepted record then adds its feature to it.

    :param gff_record: mapping with ``'type'``, ``'quals'``, ``'location'``
        and, for new transcripts, ``'rec_id'`` and ``'strand'``
    :param seq_dict: mapping from transcript id to an object whose ``seq``
        attribute is stringified and stored as the transcript sequence
    :param seq_dict_keys: not used by this method
    :param tx_to_protein_mapping: mapping from transcript id (without the
        part after the first '.') to protein id
    :return: None in all cases
    """
    feature_type = gff_record['type']
    if feature_type not in ("exon", "CDS", "start_codon", "stop_codon"):
        return None

    quals = gff_record['quals']
    transcript_id = quals['transcript_id'][0]

    try:
        tx = self._transcript_index[transcript_id]
    except KeyError:
        # Create the initial record for this transcript.
        contig = MutUtils.convertChromosomeStringToMutationDataFormat(gff_record['rec_id'])
        tx = Transcript(transcript_id, gene=quals['gene_name'][0], gene_id=quals['gene_id'][0], contig=contig)
        self._transcript_index[transcript_id] = tx

        # Set the gene_type based on gene_biotype, falling back to gene_type.
        type_key = "gene_biotype" if "gene_biotype" in quals else "gene_type"
        tx.set_gene_type(quals.get(type_key, [""])[0])

        tx.set_strand("+" if gff_record['strand'] == 1 else "-")

        for attribute in GenomeBuildFactory.QUALS_TO_CHECK:
            if attribute in quals:
                tx.add_other_attribute(attribute, "|".join(quals[attribute]))

        seq = seq_dict.get(transcript_id, None)
        tx.set_seq(str(seq.seq) if seq is not None else "")

        # The protein lookup uses the id up to (not including) the first '.'.
        tx_id_for_protein_lookup = transcript_id.partition('.')[0]
        tx.set_protein_id(tx_to_protein_mapping.get(tx_id_for_protein_lookup, ""))

    location = gff_record['location']
    begin, finish = location[0], location[1]
    if feature_type == 'exon':
        tx.add_exon(begin, finish, quals['exon_number'][0])
    elif feature_type == 'CDS':
        tx.add_cds(begin, finish)
    elif feature_type == 'start_codon':
        tx.set_start_codon(begin, finish)
    else:
        # Only stop_codon remains after the type filter above.
        tx.set_stop_codon(begin, finish)
def _convertGFFRecordToTranscript(self, gff_record, seq_dict, seq_dict_keys, tx_to_protein_mapping):
    """Add the feature described by one GFF record to its transcript.

    Only exon, CDS, start_codon and stop_codon records are processed. When
    the record's transcript id is not yet in ``self._transcript_index`` a
    new transcript is created and initialised before the feature is added.

    :param gff_record: mapping providing ``'type'``, ``'quals'`` and
        ``'location'``; ``'rec_id'`` and ``'strand'`` are read when a new
        transcript is created
    :param seq_dict: mapping from transcript id to an object with a ``seq``
        attribute; a missing id yields an empty sequence string
    :param seq_dict_keys: not used by this method
    :param tx_to_protein_mapping: mapping from transcript id, cut at the
        first '.', to protein id
    :return: None in all cases
    """
    types_of_interest = ("exon", "CDS", "start_codon", "stop_codon")
    gff_type = gff_record['type']
    if gff_type not in types_of_interest:
        return None

    quals = gff_record['quals']
    transcript_id = quals['transcript_id'][0]
    index = self._transcript_index

    try:
        tx = index[transcript_id]
    except KeyError:
        # First record for this transcript: create and initialise it.
        tx = Transcript(
            transcript_id,
            gene=quals['gene_name'][0],
            gene_id=quals['gene_id'][0],
            contig=MutUtils.convertChromosomeStringToMutationDataFormat(gff_record['rec_id']),
        )
        index[transcript_id] = tx

        # Prefer gene_biotype; fall back to gene_type when it is absent.
        if "gene_biotype" in quals:
            gene_type = quals.get("gene_biotype", [""])[0]
        else:
            gene_type = quals.get("gene_type", [""])[0]
        tx.set_gene_type(gene_type)

        if gff_record['strand'] == 1:
            tx.set_strand("+")
        else:
            tx.set_strand("-")

        for attribute in GenomeBuildFactory.QUALS_TO_CHECK:
            if attribute in quals:
                tx.add_other_attribute(attribute, "|".join(quals[attribute]))

        seq = seq_dict.get(transcript_id, None)
        genome_seq_as_str = "" if seq is None else str(seq.seq)
        tx.set_seq(genome_seq_as_str)

        # Protein ids are keyed by the transcript id up to the first '.'.
        lookup_id, _, _ = transcript_id.partition('.')
        tx.set_protein_id(tx_to_protein_mapping.get(lookup_id, ""))

    loc_start = gff_record['location'][0]
    loc_end = gff_record['location'][1]
    if gff_type == 'exon':
        tx.add_exon(loc_start, loc_end, quals['exon_number'][0])
    elif gff_type == 'CDS':
        tx.add_cds(loc_start, loc_end)
    elif gff_type == 'start_codon':
        tx.set_start_codon(loc_start, loc_end)
    elif gff_type == 'stop_codon':
        tx.set_stop_codon(loc_start, loc_end)