def OutputStats(self, output_fname):
    """Write a tab-separated summary of every clonal lineage to a file.

    A header line is written first, then one row per element of
    ``self.clonal_lineages``, ordered by ``len(lineage)`` from largest to
    smallest. For each lineage the row holds, in header order: its id, its
    length, the results of ``_GetNumberNonTrivialSequences`` and
    ``_GetHighestMultiplicity``, the base names of the abundant V and J
    genes reported by ``VJGeneAnnotator``, the id / sequence / CDR3 of the
    root sequence chosen by ``_GetRootSeq``, and the number of V plus J
    SHMs outside CDR3 of that root.

    Args:
        output_fname: path of the file to write; it is opened in ``'w'``
            mode, so existing content is overwritten.
    """
    # A context manager guarantees the handle is closed even when
    # processing one of the lineages raises part-way through the table.
    with open(output_fname, 'w') as fh:
        fh.write(
            'LineageID\tLineageSizeBeforeCleaning\tNumNonTrivialSeqs\tMaxMultiplicity\tClosestV\tClosestJ\tRootId\tRootSeq\tRootCDR3\tRootDistanceFromGermline\n'
        )
        for lineage in sorted(self.clonal_lineages, key=len, reverse=True):
            vj_ann = vj_annotator.VJGeneAnnotator(lineage)
            abundant_v = utils.GetBaseGeneName(
                vj_ann.GetAbundantGene(dataset.AnnotatedGene.V))
            abundant_j = utils.GetBaseGeneName(
                vj_ann.GetAbundantGene(dataset.AnnotatedGene.J))
            root_seq = self._GetRootSeq(lineage, abundant_v)
            num_shms_in_root = len(
                self.dataset.GetVSHMsOutsideCDR3(root_seq.id)) + len(
                    self.dataset.GetJSHMsOutsideCDR3(root_seq.id))
            # Field order must match the header written above.
            fields = [
                lineage.id(),
                str(len(lineage)),
                str(self._GetNumberNonTrivialSequences(lineage)),
                str(self._GetHighestMultiplicity(lineage)),
                abundant_v,
                abundant_j,
                root_seq.id,
                root_seq.seq,
                self.dataset.GetCDR3BySeqName(root_seq.id),
                str(num_shms_in_root),
            ]
            fh.write('\t'.join(fields) + '\n')
def SequenceIsGood(self, seq):
    """Tell whether ``seq`` uses the lineage's most frequent V and J genes.

    The V and J gene hits of ``seq.id`` are fetched from the dataset of
    ``self.full_length_lineage``; their base gene names are compared with
    ``self.most_freq_v`` and ``self.most_freq_j`` respectively.

    Args:
        seq: object exposing an ``id`` attribute known to the dataset.

    Returns:
        A truthy value only when both the V and the J base names match.
    """
    v_hit = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
        seq.id, dataset.AnnotatedGene.V)
    j_hit = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
        seq.id, dataset.AnnotatedGene.J)
    same_v = utils.GetBaseGeneName(v_hit) == self.most_freq_v
    # The J base name is only computed when the V gene already matches.
    return same_v and utils.GetBaseGeneName(j_hit) == self.most_freq_j
def _GetRootSeq(self, lineage, abundant_v):
    """Choose the lineage sequence with the fewest SHMs outside CDR3.

    Only sequences whose base V gene name equals ``abundant_v`` are
    candidates. Each candidate is scored by the number of V SHMs plus the
    number of J SHMs outside CDR3, lower being better; on ties the
    candidate met first during iteration wins. Candidates whose translated
    sequence contains no stop codon (``'*'``) are preferred over the rest.

    Args:
        lineage: object providing ``FullLengthSeqIdIter()``, which yields
            items with ``id`` and ``seq`` attributes.
        abundant_v: base V gene name a candidate must match.

    Returns:
        The best stop-codon-free candidate if there is one, otherwise the
        best candidate overall, otherwise an empty ``SeqRecord`` whose
        ``id`` is ``''``.
    """
    best_any = SeqRecord(Seq(''), id='')
    best_any_shms = None  # None means "no candidate seen yet"
    best_productive = SeqRecord(Seq(''), id='')
    best_productive_shms = None
    for seq in lineage.FullLengthSeqIdIter():
        v_gene = self.dataset.GetGeneHitBySeqName(seq.id,
                                                  dataset.AnnotatedGene.V)
        if utils.GetBaseGeneName(v_gene) != abundant_v:
            continue
        cur_num_shms = len(self.dataset.GetVSHMsOutsideCDR3(seq.id)) + len(
            self.dataset.GetJSHMsOutsideCDR3(seq.id))
        if best_any_shms is None or cur_num_shms < best_any_shms:
            best_any_shms = cur_num_shms
            best_any = seq
        # A '*' in the translation marks a stop codon; such sequences can
        # only be returned through the "best overall" fallback.
        if '*' in str(Seq(seq.seq).translate()):
            continue
        if best_productive_shms is None or cur_num_shms < best_productive_shms:
            best_productive_shms = cur_num_shms
            best_productive = seq
    if best_productive.id != '':
        return best_productive
    return best_any
def _UpdateGeneDict(self, gene_type, lineage):
    """Append the root sequence's SHMs to the usage table of its gene.

    The gene hit of ``gene_type`` for the lineage's root sequence is
    reduced to its base gene name; the SHMs of the root for that gene type
    are then appended to ``self.gene_usage[gene_type][<base name>]``. The
    list is created the first time a base name is seen.

    Args:
        gene_type: key into ``self.gene_usage``, also passed to the
            dataset lookups.
        lineage: object providing ``RootSeqId()`` and ``Dataset()``.
    """
    root_id = lineage.RootSeqId()
    gene_hit = lineage.Dataset().GetGeneHitBySeqName(root_id, gene_type)
    base_name = utils.GetBaseGeneName(gene_hit)
    per_gene_shms = self.gene_usage[gene_type].setdefault(base_name, [])
    per_gene_shms.append(
        lineage.Dataset().GetSHMsBySeqName(root_id, gene_type))
def _FindMostAbundantGene(self, gene_type):
    """Count base gene names of ``gene_type`` over the full-length lineage.

    For every sequence yielded by
    ``self.full_length_lineage.FullLengthSeqIdIter()`` the gene hit of
    ``gene_type`` is looked up and reduced with ``utils.GetBaseGeneName``.
    The resulting ``Counter`` (base gene name -> number of sequences) is
    stored in ``self.gene_type_mults[gene_type]``.

    Despite its name, this method returns nothing; callers read the counts
    from ``self.gene_type_mults``.

    Args:
        gene_type: gene type passed to the dataset lookup and used as the
            key in ``self.gene_type_mults``.
    """
    lineage = self.full_length_lineage
    # Counter consumes the generator directly; no intermediate list needed.
    self.gene_type_mults[gene_type] = Counter(
        utils.GetBaseGeneName(
            lineage.Dataset().GetGeneHitBySeqName(seq.id, gene_type))
        for seq in lineage.FullLengthSeqIdIter())
def GetIdentifiedGenes(self):
    """Report which D genes were identified and which were not.

    A classification counts as an identification only when its
    ``gene_ids`` holds exactly one entry; that entry's base gene name is
    added to the found set. Every gene in ``self.d_genes`` whose base name
    is not in the found set is reported as missing. Both sets are printed
    (count followed by the sorted, comma-separated names) and returned.

    Returns:
        A ``(found_d_genes, missing_d_genes)`` tuple of sets of base gene
        names.
    """
    found_d_genes = set()
    for c in self.classifications:
        if len(c.gene_ids) == 1:
            found_d_genes.add(utils.GetBaseGeneName(c.gene_ids[0]))
    missing_d_genes = set(
        utils.GetBaseGeneName(d.id) for d in self.d_genes) - found_d_genes
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(str(len(found_d_genes)) + " D genes are identified: " +
          ','.join(sorted(found_d_genes)))
    # Report the size of the set that is actually listed:
    # len(self.d_genes) - len(found_d_genes) disagrees with it whenever
    # several entries of d_genes share a base name or a found gene is not
    # part of d_genes.
    print(str(len(missing_d_genes)) + ' D genes are missing: ' +
          ','.join(sorted(missing_d_genes)))
    return found_d_genes, missing_d_genes
def _FindMostAbundantGene(self, gene_type):
    """Tally base gene names of ``gene_type`` over the full-length lineage.

    Builds a plain dict mapping base gene name -> number of sequences whose
    gene hit of ``gene_type`` reduces to that name, and stores it in
    ``self.gene_type_mults[gene_type]``. Nothing is returned.

    Args:
        gene_type: gene type passed to the dataset lookup and used as the
            key in ``self.gene_type_mults``.
    """
    counts = {}
    for record in self.full_length_lineage.FullLengthSeqIdIter():
        hit = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
            record.id, gene_type)
        name = utils.GetBaseGeneName(hit)
        counts[name] = counts.get(name, 0) + 1
    self.gene_type_mults[gene_type] = counts
def _FindMostAbundantGene(self, gene_type):
    """Return the base gene name of ``gene_type`` seen in the most sequences.

    Every sequence yielded by
    ``self.full_length_lineage.FullLengthSeqIdIter()`` contributes one
    count to the base name of its gene hit. When several names share the
    highest count, the one met first while iterating the count dict is
    returned.

    Args:
        gene_type: gene type passed to the dataset lookup.

    Returns:
        The base gene name with the highest sequence count.

    Raises:
        ValueError: if the lineage yields no sequences, because ``max`` is
            then applied to an empty dict.
    """
    gene_counts = {}  # base gene name -> number of sequences
    for seq in self.full_length_lineage.FullLengthSeqIdIter():
        gene_name = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
            seq.id, gene_type)
        base_name = utils.GetBaseGeneName(gene_name)
        gene_counts[base_name] = gene_counts.get(base_name, 0) + 1
    # One pass over the keys, ranked by their counts.
    return max(gene_counts, key=gene_counts.get)
def _CreateSubstringSegmentClassification(self, segment_type, segment,
                                          d_indices):
    """Build a ``DSegmentClassification`` for a segment and its D gene hits.

    For ``SegmentType.AMBIGUOUS_GENE`` the segment is passed through
    unchanged with an empty third argument and the base names of all genes
    at ``d_indices``. For any other type only the first index is used: the
    classification gets the alignment sequence computed by
    ``_GetAlignmentSeqForSubstring`` against that gene, the gene's
    sequence, and a one-element list holding the gene's id.

    Args:
        segment_type: a ``SegmentType`` value.
        segment: the segment being classified.
        d_indices: indices into ``self.d_genes``; at least one entry is
            needed on the non-ambiguous path.

    Returns:
        The constructed ``DSegmentClassification``.
    """
    is_ambiguous = segment_type == SegmentType.AMBIGUOUS_GENE
    if is_ambiguous:
        candidate_names = [
            utils.GetBaseGeneName(self.d_genes[ind].id) for ind in d_indices
        ]
        return DSegmentClassification(segment_type, segment, '',
                                      candidate_names)
    primary_gene = self.d_genes[d_indices[0]]
    aligned_segment = self._GetAlignmentSeqForSubstring(
        segment, primary_gene.seq)
    gene_id_list = [primary_gene.id]
    return DSegmentClassification(segment_type, aligned_segment,
                                  primary_gene.seq, gene_id_list)
def _GetRootSeq(self, lineage, abundant_v):
    """Find the lineage sequence with the fewest SHMs outside CDR3.

    Sequences whose base V gene name differs from ``abundant_v`` are
    skipped. The remaining ones are scored by the number of V SHMs plus
    the number of J SHMs outside CDR3; the first sequence reaching the
    lowest score is kept.

    Args:
        lineage: object providing ``FullLengthSeqIdIter()``, which yields
            items with an ``id`` attribute.
        abundant_v: base V gene name a candidate must match.

    Returns:
        The selected sequence object, or the empty string ``''`` when no
        sequence qualifies (note that ``''`` has no ``id`` attribute).
    """
    best_seq = ''
    fewest_shms = sys.maxint
    for candidate in lineage.FullLengthSeqIdIter():
        v_hit = self.dataset.GetGeneHitBySeqName(candidate.id,
                                                 dataset.AnnotatedGene.V)
        if utils.GetBaseGeneName(v_hit) != abundant_v:
            continue
        v_count = len(self.dataset.GetVSHMsOutsideCDR3(candidate.id))
        j_count = len(self.dataset.GetJSHMsOutsideCDR3(candidate.id))
        shm_total = v_count + j_count
        # Strictly fewer SHMs are required, so earlier candidates win ties.
        if shm_total >= fewest_shms:
            continue
        fewest_shms = shm_total
        best_seq = candidate
    return best_seq
def _SegmentIsAmbiguousGene(self, d_indices):
    """Tell whether the given D gene hits cover more than one base gene.

    Args:
        d_indices: indices into ``self.d_genes``.

    Returns:
        True when the genes at ``d_indices`` reduce to at least two
        distinct base gene names, False otherwise.
    """
    distinct_names = set()
    for ind in d_indices:
        distinct_names.add(utils.GetBaseGeneName(self.d_genes[ind].id))
    return len(distinct_names) > 1