def get_mutation_statistics(self, disease_name, mutation_type):
    """Return per-gene mutation frequencies and effect ratios for a disease.

    Every study returned by ``self._get_studies_from_disease_name`` is
    queried through ``cbio_client`` for the genes given by
    ``self._get_gene_list``. Each (gene symbol, amino acid change) pair
    found is counted for its gene and classified with
    ``self.find_mutation_effect``.

    Parameters
    ----------
    disease_name
        Handed to ``self._get_studies_from_disease_name`` to find studies.
    mutation_type
        Handed unchanged to ``cbio_client.get_mutations``.

    Returns
    -------
    dict
        Maps each gene symbol to a two-element list
        ``[fraction, effects]``. ``fraction`` is the number of mutation
        records seen for the gene divided by the summed number of
        sequenced cases over all studies. ``effects`` maps 'activate',
        'deactivate' and 'other' to the share of that gene's records with
        the given effect.

    Raises
    ------
    DiseaseNotFoundException
        If no study is found for ``disease_name``.
    """
    study_ids = self._get_studies_from_disease_name(disease_name)
    if not study_ids:
        raise DiseaseNotFoundException
    gene_list = self._get_gene_list()

    stats = {}
    total_cases = 0
    for study_id in study_ids:
        total_cases += cbio_client.get_num_sequenced(study_id)
        mutations = cbio_client.get_mutations(study_id, gene_list,
                                              mutation_type)
        gene_changes = zip(mutations['gene_symbol'],
                           mutations['amino_acid_change'])
        for gene, aa_change in gene_changes:
            effect = self.find_mutation_effect(gene, aa_change)
            # A missing effect is tallied as 'other'. Any other result is
            # used directly as a key of the effects dict, so it must be
            # 'activate' or 'deactivate' (anything else raises KeyError).
            effect_key = 'other' if effect is None else effect
            if gene in stats:
                entry = stats[gene]
                entry[0] += 1.0
                entry[1][effect_key] += 1
            else:
                effects = {'activate': 0.0, 'deactivate': 0.0, 'other': 0.0}
                effects[effect_key] += 1.0
                stats[gene] = [1.0, effects]

    # Turn the raw counts into ratios, in place.
    for entry in stats.values():
        entry[0] /= total_cases
        effects = entry[1]
        # The total is taken once, before any of the values is rescaled.
        effect_total = numpy.sum(list(effects.values()))
        for effect_name in ('activate', 'deactivate', 'other'):
            effects[effect_name] /= effect_total
    return stats
def test_get_num_sequenced():
    """The 'paad_tcga' study must report a positive sequenced-case count."""
    sequenced_count = cbio_client.get_num_sequenced('paad_tcga')
    assert sequenced_count > 0
def get_mutation_statistics(self, disease_name, mutation_type):
    """Collect statistics on the most mutated gene of each disease study.

    For every study returned by ``self._get_studies_from_disease_name``,
    the mutations of the requested type are fetched through
    ``cbio_client`` for the genes given by ``self._get_gene_list``. Each
    amino acid change that parses as ``<residues><position><residues>``
    becomes an Agent carrying a MutCondition. The gene with the most such
    Agents in the study (its "top gene") is then tallied, and the effect
    of each of its Agents is looked up with ``self.find_mutation_effect``.

    Parameters
    ----------
    disease_name
        Handed to ``self._get_studies_from_disease_name`` to find studies.
    mutation_type
        Handed unchanged to ``cbio_client.get_mutations``.

    Returns
    -------
    dict
        Maps each top gene symbol to a dict with the keys:

        - 'count': number of Agents tallied for the gene,
        - 'effects': counts under 'activate', 'deactivate' and 'other',
          plus matching '<effect>_percent' entries holding each count
          divided by 'count' (a ratio between 0 and 1, not multiplied
          by 100),
        - 'total_effects': initialised to 0 and not updated here,
        - 'agents': the tallied Agents,
        - 'fraction': 'count' divided by the summed number of sequenced
          cases over all studies that could be queried.

        The dict is empty if no study produced a usable mutation.

    Raises
    ------
    DiseaseNotFoundException
        If no study is found for ``disease_name``.
    """
    study_ids = self._get_studies_from_disease_name(disease_name)
    if not study_ids:
        raise DiseaseNotFoundException
    gene_list = self._get_gene_list()
    mutation_dict = {}
    num_case = 0
    logger.info("Found %d studies and a gene_list of %d elements."
                % (len(study_ids), len(gene_list)))
    # Raw string so that \d reaches the regex engine as written instead of
    # being treated as an (invalid) string escape sequence.
    mut_patt = re.compile(r"([A-Z]+)(\d+)([A-Z]+)")
    for study_id in study_ids:
        try:
            num_case += cbio_client.get_num_sequenced(study_id)
        except Exception as e:
            # Best effort: a study whose case count cannot be retrieved is
            # skipped, but the failure is reported rather than hidden.
            logger.warning("Could not get the number of sequenced cases "
                           "for %s, skipping it: %s" % (study_id, e))
            continue

        mutations = cbio_client.get_mutations(study_id, gene_list,
                                              mutation_type)

        if not mutations['gene_symbol']:
            logger.info("Found no genes for %s." % study_id)
            continue

        # Create agents from the results of the search.
        agent_dict = {}
        for g, a in zip(mutations['gene_symbol'],
                        mutations['amino_acid_change']):
            m = mut_patt.match(a)
            if m is None:
                logger.warning("Unrecognized residue: %s" % a)
                continue
            res_from, pos, res_to = m.groups()
            try:
                mut = MutCondition(pos, res_from, res_to)
            except InvalidResidueError:
                logger.warning("Invalid residue: %s or %s."
                               % (res_from, res_to))
                continue
            ag = Agent(g, db_refs={'HGNC': hgnc_client.get_hgnc_id(g)},
                       mutations=[mut])
            agent_dict.setdefault(g, []).append(ag)
        if not agent_dict:
            # Nothing usable in this study. Move on to the next one rather
            # than returning, so results from other studies are kept.
            continue

        # Get the most mutated gene.
        top_gene = max(agent_dict, key=lambda k: len(agent_dict[k]))
        logger.info("Found %d genes, with top hit %s for %s."
                    % (len(agent_dict), top_gene, study_id))

        if top_gene not in mutation_dict:
            effect_dict = {'activate': 0, 'deactivate': 0, 'other': 0}
            mutation_dict[top_gene] = {'count': 0, 'effects': effect_dict,
                                       'total_effects': 0, 'agents': []}
        gene_stats = mutation_dict[top_gene]
        for agent in agent_dict[top_gene]:
            # Get the mutations effects for that gene.
            mutation_effect = self.find_mutation_effect(agent)
            if mutation_effect is None:
                mutation_effect_key = 'other'
            else:
                mutation_effect_key = mutation_effect
            gene_stats['count'] += 1
            gene_stats['effects'][mutation_effect_key] += 1
            gene_stats['agents'].append(agent)

    # Calculate normalized entries
    for v in mutation_dict.values():
        # float() keeps these true divisions even where "/" on two ints
        # would truncate (Python 2 without __future__ division).
        count = float(v['count'])
        v['fraction'] = count / num_case
        # Snapshot the keys first: the loop adds '<effect>_percent' keys to
        # the same dict it is reading from.
        for eff in list(v['effects']):
            v['effects'][eff + '_percent'] = v['effects'][eff] / count
    return mutation_dict
def test_get_num_sequenced():
    """Querying the 'paad_tcga' study must yield at least one sequenced case."""
    assert cbio_client.get_num_sequenced('paad_tcga') > 0