def expand_liquid_oncotree(onco_tree):
    """
    Split the Oncotree into liquid and solid tumor type names.

    The liquid set is every node reachable from the nodes whose text is
    "Lymph" or "Blood"; the solid set is every other node in the tree.
    Names that, once stripped of surrounding whitespace, are contained in
    ``get_primary_tumors()`` are left out of both lists.

    Node attributes are read through ``onco_tree.nodes[...]``; the older
    ``onco_tree.node[...]`` accessor was removed in networkx 2.4.

    :param onco_tree: Digraph of the Oncotree
    :returns liquid_children: All liquid tumor types in the Oncotree
             solid_children: All tumor types in the Oncotree minus "liquid_children"
    """

    # build the nodes for liquid.
    node1 = oncotreenx.lookup_text(onco_tree, "Lymph")
    node2 = oncotreenx.lookup_text(onco_tree, "Blood")

    nodes1 = list(nx.dfs_tree(onco_tree, node1))
    nodes2 = list(nx.dfs_tree(onco_tree, node2))
    nodes = list(set(nodes1).union(set(nodes2)))

    primary_tumors = get_primary_tumors()

    def _texts(codes):
        # Text of each node code, skipping names listed as primary tumors.
        texts = []
        for nn in codes:
            text = onco_tree.nodes[nn]['text']
            if text.strip() not in primary_tumors:
                texts.append(text)
        return texts

    # NOTE(review): `nodes` already contains every descendant of the two
    # roots, so expanding each member again only repeats names. The loop is
    # kept so the returned list is the same as before.
    liquid_children_codes = []
    for n in nodes:
        liquid_children_codes.extend(nx.dfs_tree(onco_tree, n))

    liquid_children = _texts(liquid_children_codes)

    # solid nodes are all other nodes
    all_nodes = set(list(onco_tree.nodes()))
    solid_children_codes = list(all_nodes - set(nodes))
    solid_children = _texts(solid_children_codes)

    return liquid_children, solid_children
def expand_liquid_oncotree(onco_tree):
    """
    Partition the Oncotree into liquid and solid tumor type names.

    Everything reachable from the "Lymph" and "Blood" nodes counts as
    liquid; all remaining nodes count as solid. Names whose stripped text
    is contained in ``get_primary_tumors()`` are omitted from both lists.

    :param onco_tree: Digraph of the Oncotree
    :returns liquid_children: All liquid tumor types in the Oncotree
             solid_children: All tumor types in the Oncotree minus "liquid_children"
    """
    # Collect the codes under the two liquid roots.
    lymph_code = oncotreenx.lookup_text(onco_tree, "Lymph")
    blood_code = oncotreenx.lookup_text(onco_tree, "Blood")
    lymph_subtree = list(nx.dfs_tree(onco_tree, lymph_code))
    blood_subtree = list(nx.dfs_tree(onco_tree, blood_code))
    liquid_codes = list(set(lymph_subtree).union(set(blood_subtree)))

    excluded_names = get_primary_tumors()

    def _names(codes):
        # Node text for every code, minus names listed as primary tumors.
        kept = []
        for code in codes:
            label = onco_tree.nodes[code]['text']
            if label.strip() not in excluded_names:
                kept.append(label)
        return kept

    # Each liquid code is expanded to its own subtree again, so a name can
    # occur more than once in the liquid result.
    expanded_codes = [
        descendant
        for code in liquid_codes
        for descendant in nx.dfs_tree(onco_tree, code)
    ]
    liquid_children = _names(expanded_codes)

    # Whatever is not liquid is solid.
    every_code = set(list(onco_tree.nodes()))
    remaining_codes = list(every_code - set(liquid_codes))
    solid_children = _names(remaining_codes)

    return liquid_children, solid_children
def extract_cancer_types(self):
    """
    Summarise the cancer types referenced by the match tree.

    Walks ``self.g`` depth-first (post-order) from node ``1``. Every node of
    type 'clinical' whose value holds an 'oncotree_primary_diagnosis' is
    expanded against the Oncotree built from ``TUMOR_TREE``:

    * '_SOLID_' / '_LIQUID_' expand to the lists returned by
      ``expand_liquid_oncotree``;
    * any other diagnosis (a leading '!' is ignored for the lookup) expands
      to the text of every node in its Oncotree subtree, plus the parents
      reported by ``get_parents``;
    * a diagnosis that ``lookup_text`` cannot resolve contributes no
      children and no parents. Previously the ``None`` result was handed to
      ``nx.dfs_tree``, which then returned the whole tree.

    Diagnoses starting with '!' feed 'excluded_cancer_types'; all others
    feed 'cancer_types_expanded' and 'primary_cancer_types'.

    Node attributes are read through ``.nodes[...]``; the ``.node``
    accessor was removed in networkx 2.4.

    :return: dict with the keys 'diagnoses', 'cancer_types_expanded',
             'primary_cancer_types' and 'excluded_cancer_types'; each value
             is a de-duplicated list with 'root' entries dropped
    """

    def _unique_without_root(values):
        # De-duplicate and drop any entry that is 'root' once stripped.
        return list(set(i for i in values if i.strip() != 'root'))

    diagnoses = []
    cancer_types_expanded = []
    primary_cancer_types = []
    excluded_cancer_types = []
    onco_tree = oncotreenx.build_oncotree(file_path=TUMOR_TREE)
    liquid_children_txt, solid_children_txt = expand_liquid_oncotree(onco_tree)

    # iterate through the graph
    for node_id in nx.dfs_postorder_nodes(self.g, source=1):
        node = self.g.nodes[node_id]
        if node['type'] != 'clinical':
            continue
        if 'oncotree_primary_diagnosis' not in node['value']:
            continue

        diagnosis = node['value']['oncotree_primary_diagnosis']

        if diagnosis == '_SOLID_':
            children_txt = solid_children_txt
            primary_parent = 'All Solid Tumors'
            parents_txt = ['All Solid Tumors']
        elif diagnosis == '_LIQUID_':
            children_txt = liquid_children_txt
            primary_parent = 'All Liquid Tumors'
            parents_txt = ['All Liquid Tumors']
        else:
            # The subtree is only needed (and only meaningful) for a
            # diagnosis that resolves to a real Oncotree node.
            n = oncotreenx.lookup_text(onco_tree, diagnosis.replace('!', ''))
            if n is not None:
                children_txt = [onco_tree.nodes[nn]['text']
                                for nn in nx.dfs_tree(onco_tree, n)]
                _, parents_txt, primary_parent = get_parents(onco_tree, n)
            else:
                children_txt = []
                parents_txt = []
                primary_parent = ''

        diagnoses.append(diagnosis)
        if diagnosis.startswith('!'):
            excluded_cancer_types.append(diagnosis.replace('!', ''))
            excluded_cancer_types.extend(children_txt)
        else:
            primary_tumors = get_primary_tumors()
            cancer_types_expanded.append(parse_diagnosis(diagnosis))
            cancer_types_expanded.extend(children_txt)
            # Parents whose first word is a primary tumor name are skipped.
            cancer_types_expanded.extend(
                [i for i in parents_txt if i.split()[0] not in primary_tumors])
            primary_cancer_types.append(primary_parent)

    return {
        'diagnoses': _unique_without_root(diagnoses),
        'cancer_types_expanded': _unique_without_root(cancer_types_expanded),
        'primary_cancer_types': _unique_without_root(primary_cancer_types),
        'excluded_cancer_types': _unique_without_root(excluded_cancer_types)
    }
def extract_cancer_types(self):
    """
    Summarise the cancer types referenced by the match tree.

    Walks ``self.g`` depth-first (post-order) from node ``1``. Every node of
    type 'clinical' whose value holds an 'oncotree_primary_diagnosis' is
    expanded against the Oncotree built from ``TUMOR_TREE``:

    * '_SOLID_' / '_LIQUID_' expand to the lists returned by
      ``expand_liquid_oncotree``;
    * any other diagnosis (a leading '!' is ignored for the lookup) expands
      to the text of every node in its Oncotree subtree, plus the parents
      reported by ``get_parents``;
    * a diagnosis that ``lookup_text`` cannot resolve contributes no
      children and no parents. Previously the ``None`` result was handed to
      ``nx.dfs_tree``, which then returned the whole tree.

    Diagnoses starting with '!' feed 'excluded_cancer_types'; all others
    feed 'cancer_types_expanded' and 'primary_cancer_types'.

    :return: dict with the keys 'diagnoses', 'cancer_types_expanded',
             'primary_cancer_types' and 'excluded_cancer_types'; each value
             is a de-duplicated list with 'root' entries dropped
    """

    def _unique_without_root(values):
        # De-duplicate and drop any entry that is 'root' once stripped.
        return list(set(i for i in values if i.strip() != 'root'))

    diagnoses = []
    cancer_types_expanded = []
    primary_cancer_types = []
    excluded_cancer_types = []
    onco_tree = oncotreenx.build_oncotree(file_path=TUMOR_TREE)
    liquid_children_txt, solid_children_txt = expand_liquid_oncotree(onco_tree)

    # iterate through the graph
    for node_id in nx.dfs_postorder_nodes(self.g, source=1):
        node = self.g.nodes[node_id]
        if node['type'] != 'clinical':
            continue
        if 'oncotree_primary_diagnosis' not in node['value']:
            continue

        diagnosis = node['value']['oncotree_primary_diagnosis']

        if diagnosis == '_SOLID_':
            children_txt = solid_children_txt
            primary_parent = 'All Solid Tumors'
            parents_txt = ['All Solid Tumors']
        elif diagnosis == '_LIQUID_':
            children_txt = liquid_children_txt
            primary_parent = 'All Liquid Tumors'
            parents_txt = ['All Liquid Tumors']
        else:
            # The subtree is only needed (and only meaningful) for a
            # diagnosis that resolves to a real Oncotree node.
            n = oncotreenx.lookup_text(onco_tree, diagnosis.replace('!', ''))
            if n is not None:
                children_txt = [onco_tree.nodes[nn]['text']
                                for nn in nx.dfs_tree(onco_tree, n)]
                _, parents_txt, primary_parent = get_parents(onco_tree, n)
            else:
                children_txt = []
                parents_txt = []
                primary_parent = ''

        diagnoses.append(diagnosis)
        if diagnosis.startswith('!'):
            excluded_cancer_types.append(diagnosis.replace('!', ''))
            excluded_cancer_types.extend(children_txt)
        else:
            primary_tumors = get_primary_tumors()
            cancer_types_expanded.append(parse_diagnosis(diagnosis))
            cancer_types_expanded.extend(children_txt)
            # Parents whose first word is a primary tumor name are skipped.
            cancer_types_expanded.extend(
                [i for i in parents_txt if i.split()[0] not in primary_tumors])
            primary_cancer_types.append(primary_parent)

    return {
        'diagnoses': _unique_without_root(diagnoses),
        'cancer_types_expanded': _unique_without_root(cancer_types_expanded),
        'primary_cancer_types': _unique_without_root(primary_cancer_types),
        'excluded_cancer_types': _unique_without_root(excluded_cancer_types)
    }
def test_text_lu(self):
    # Looking up the text "Adrenal Gland" in the test graph must yield the
    # node 'ADRENAL_GLAND'.
    assert lookup_text(self.g, "Adrenal Gland") == 'ADRENAL_GLAND'
def get_histology_type(self, cancer_type_text):
    """Convert oncotree text to histology type

    The text is first looked up directly in ``histology_type_dict``. If it
    is not a key there, the matching node of ``self.oncotree`` is located
    and its 'metamaintype' attribute is looked up instead.

    :param cancer_type_text: oncotree text of the cancer type
    :return: the mapped histology type, or 'removeme' when the text cannot
             be resolved in the oncotree or its 'metamaintype' has no entry
             in ``histology_type_dict``
    """
    if cancer_type_text in histology_type_dict:
        return histology_type_dict[cancer_type_text]

    try:
        node_id = oncotreenx.lookup_text(self.oncotree, cancer_type_text)
        # `.nodes[...]` replaces the `.node` accessor removed in networkx 2.4.
        metamaintype = self.oncotree.nodes[node_id]['metamaintype']
    except KeyError:
        # Single-argument print(...) prints the same text on Python 2 and 3;
        # the former print statement is a SyntaxError on Python 3.
        print('## WARNING: \'%s\' is not a valid oncotree text input.'
              ' Sample was removed from analysis' % cancer_type_text)
        return 'removeme'

    if metamaintype in histology_type_dict:
        return histology_type_dict[metamaintype]
    return 'removeme'
def prepare_criteria(item):
    """
    Turn a filter item into clinical/genomic query criteria plus summary text.

    :param item: mapping that may hold a 'clinical_filter' and/or a
                 'genomic_filter' mapping
    :return: tuple ``(c, g, (gen_txt, [clin_txt_1, clin_txt_2_age,
             clin_txt_2_gender]))`` where

             * ``c`` is the clinical criteria: the clinical filter after the
               ``REREPLACEMENTS`` text substitutions, with BIRTH_DATE /
               REPORT_DATE values parsed to datetimes and the Oncotree
               diagnosis expanded to ``{'$in': [...]}`` over its subtree;
             * ``g`` is the genomic criteria: an ``{'$or': [...]}`` of
               per-variant-category clauses when any category was requested,
               plus the remaining genomic filter keys;
             * ``gen_txt`` is a text summary of the genomic filter (it stays
               a list when the filter has no 'TRUE_HUGO_SYMBOL');
             * the ``clin_txt_*`` strings summarise diagnosis, age and gender.
    """

    def _parse_date(text):
        # Try the fixed '%a, %d %b %Y %H:%M:%S' layout (with any " GMT"
        # removed) first; anything else is handed to dateutil.
        try:
            return datetime.datetime.strptime(
                text.replace(" GMT", ""), '%a, %d %b %Y %H:%M:%S')
        except ValueError:
            return dateutil.parser.parse(text)

    onco_tree = oncotreenx.build_oncotree(settings.DATA_ONCOTREE_FILE)

    c = {}
    clin_txt_1 = ""
    clin_txt_2_gender = ""
    clin_txt_2_age = ""
    if 'clinical_filter' in item:
        clinical = item['clinical_filter']

        # Apply the REREPLACEMENTS substitutions on the serialised filter.
        clin_tmp = json.dumps(clinical)
        for key, val in REREPLACEMENTS.items():
            clin_tmp = clin_tmp.replace(key, val)
        c = json.loads(clin_tmp)

        if 'GENDER' in clinical:
            clin_txt_2_gender = clinical['GENDER']

        if 'BIRTH_DATE' in clinical:
            op = next(iter(clinical['BIRTH_DATE'].keys()))
            val = next(iter(clinical['BIRTH_DATE'].values()))
            born = _parse_date(val)

            # compute the age.
            today = datetime.date.today()
            age = today.year - born.year - (
                (today.month, today.day) < (born.month, born.day - 1))

            # A "gte" bound on the birth date is shown as an upper bound on
            # age; every other operator is shown as a lower bound.
            if "gte" in op:
                clin_txt_2_age = "< %s" % age
            else:
                clin_txt_2_age = "> %s" % age

        # parse date-times.
        for key in ['BIRTH_DATE', 'REPORT_DATE']:
            if key not in c:
                continue

            # extract the expression value.
            lkey, lval = next(iter(c[key].keys())), next(iter(c[key].values()))
            c[key][lkey] = _parse_date(lval)

        # expand oncotree
        if 'ONCOTREE_PRIMARY_DIAGNOSIS_NAME' in clinical:
            txt = clinical['ONCOTREE_PRIMARY_DIAGNOSIS_NAME']

            # Stays None when the diagnosis text is not found in the tree; the
            # criteria then keep the literal text instead of failing on an
            # unassigned variable.
            nodes = None
            if txt == "_LIQUID_" or txt == "_SOLID_":
                node1 = oncotreenx.lookup_text(onco_tree, "Lymph")
                node2 = oncotreenx.lookup_text(onco_tree, "Blood")
                nodes1 = list(nx.dfs_tree(onco_tree, node1))
                nodes2 = list(nx.dfs_tree(onco_tree, node2))
                nodes = list(set(nodes1).union(set(nodes2)))

                if txt == "_SOLID_":
                    # Solid is everything outside the liquid subtrees.
                    all_nodes = set(list(onco_tree.nodes()))
                    nodes = list(all_nodes - set(nodes))

                clin_txt_1 = "%s cancers" % txt.replace("_", "").title()
            else:
                clin_txt_1 = "%s" % txt
                node = oncotreenx.lookup_text(onco_tree, txt)
                if onco_tree.has_node(node):
                    nodes = list(nx.dfs_tree(onco_tree, node))

            if nodes is not None:
                # `.nodes[...]` replaces the `.node` accessor removed in
                # networkx 2.4.
                nodes_txt = [onco_tree.nodes[n]['text'] for n in nodes]
                c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME'] = {'$in': nodes_txt}

    g = {}
    gen_txt = []
    if 'genomic_filter' in item:
        genomic = item['genomic_filter']

        gen_tmp = json.dumps(genomic)
        for key, val in REREPLACEMENTS.items():
            gen_tmp = gen_tmp.replace(key, val)
        g = json.loads(gen_tmp)

        # add TRUE_HUGO_SYMBOL value mutational signature filter queries
        if 'TRUE_HUGO_SYMBOL' in g and g['TRUE_HUGO_SYMBOL'] == {'$in': ['']}:
            g['TRUE_HUGO_SYMBOL'] = None

        sv_test = False
        mut_test = False
        cnv_test = False
        if 'VARIANT_CATEGORY' in genomic:
            variant_category = genomic['VARIANT_CATEGORY']
            if isinstance(variant_category, dict):
                for x in variant_category.values():
                    categories = set(x)
                    if "SV" in categories:
                        sv_test = True
                    if "CNV" in categories:
                        cnv_test = True
                    if "MUTATION" in categories:
                        mut_test = True
            elif variant_category == 'SV':
                sv_test = True
            elif variant_category == 'CNV':
                cnv_test = True
            elif variant_category == 'MUTATION':
                mut_test = True

        # build text.
        exon_txt = ""
        protein_txt = ""
        if mut_test:
            gen_txt.append("Mutation")
            if 'TRUE_EXON_CHANGE' in genomic:
                exon_txt = genomic['TRUE_EXON_CHANGE']
            if 'TRUE_PROTEIN_CHANGE' in genomic:
                protein_txt = genomic['TRUE_PROTEIN_CHANGE']

        if cnv_test:
            if 'CNV_CALL' in g:
                if isinstance(g['CNV_CALL'], dict):
                    gen_txt += next(iter(g['CNV_CALL'].values()))
                else:
                    gen_txt.append(g['CNV_CALL'])

        if sv_test:
            gen_txt.append("Structural rearrangement")

        if 'MMR_STATUS' in genomic:
            gen_txt.append(genomic['MMR_STATUS'])
        if 'TABACCO_STATUS' in genomic:
            gen_txt.append('Tobacco Mutational Signature')
        if 'TEMOZOLOMIDE_STATUS' in genomic:
            gen_txt.append('Temozolomide Mutational Signature')
        if 'POLE_STATUS' in genomic:
            gen_txt.append('PolE Mutational Signature')
        if 'APOBEC_STATUS' in genomic:
            gen_txt.append('APOBEC Mutational Signature')
        if 'UVA_STATUS' in genomic:
            gen_txt.append('UVA Mutational Signature')

        # One clause per requested variant category; they are OR-ed below.
        clauses = []
        if mut_test:
            clause = {
                'VARIANT_CATEGORY': 'MUTATION',
                'TRUE_HUGO_SYMBOL': g['TRUE_HUGO_SYMBOL']
            }
            if 'WILDTYPE' in g:
                clause['WILDTYPE'] = g['WILDTYPE']
            if 'TRUE_PROTEIN_CHANGE' in g:
                clause['TRUE_PROTEIN_CHANGE'] = g['TRUE_PROTEIN_CHANGE']
            clauses.append(clause)

        if cnv_test:
            clause = {
                'VARIANT_CATEGORY': 'CNV',
                'TRUE_HUGO_SYMBOL': g['TRUE_HUGO_SYMBOL'],
            }
            if 'CNV_CALL' in g:
                clause['CNV_CALL'] = g['CNV_CALL']
            if 'WILDTYPE' in g:
                clause['WILDTYPE'] = g['WILDTYPE']
            clauses.append(clause)

        if sv_test:
            true_hugo = genomic['TRUE_HUGO_SYMBOL']
            if isinstance(true_hugo, dict):
                genes = next(iter(true_hugo.values()))
            else:
                genes = [true_hugo]

            # Also search for every known synonym of the requested genes.
            to_add = list()
            for gene in genes:
                if gene in synonyms:
                    to_add += synonyms[gene]
            genes = genes + to_add

            # Match the gene as a whole word anywhere in the comment.
            # NOTE(review): gene names are interpolated into the pattern
            # unescaped -- confirm they can never contain regex
            # metacharacters.
            abc = '|'.join([
                rf"(.*\W{gene}\W.*)|(^{gene}\W.*)|(.*\W{gene}$)"
                for gene in genes
            ])
            clauses.append({'STRUCTURAL_VARIANT_COMMENT': {"$regex": abc}})
            clauses.append({'LEFT_PARTNER_GENE': {'$in': genes}})
            clauses.append({'RIGHT_PARTNER_GENE': {'$in': genes}})

        if len(clauses) > 0:
            g = {"$or": clauses}

        # Keys already folded into the clauses above are not copied again.
        special_clauses = {
            'STRUCTURAL_VARIANT_COMMENT', 'VARIANT_CATEGORY',
            'TRUE_HUGO_SYMBOL', 'CNV_CALL', 'WILDTYPE', 'TRUE_PROTEIN_CHANGE'
        }
        for key in genomic:
            if key in special_clauses:
                continue
            g[key] = genomic[key]

        get_recursively(g, "GMT")

        if 'TRUE_HUGO_SYMBOL' in genomic:
            if isinstance(genomic['TRUE_HUGO_SYMBOL'], dict):
                genes = next(iter(genomic['TRUE_HUGO_SYMBOL'].values()))
            else:
                genes = [genomic['TRUE_HUGO_SYMBOL']]

            genes = [str(i) for i in genes]
            genes = ', '.join(genes)

            if len(gen_txt) > 1:
                gen_txt = "%s: %s" % (genes, ', '.join(gen_txt))
            else:
                if exon_txt == "" and protein_txt == "":
                    gen_txt = "%s %s" % (genes, ', '.join(gen_txt))
                elif exon_txt != "":
                    gen_txt = "%s exon %s" % (genes, exon_txt)
                else:
                    gen_txt = "%s %s" % (genes, protein_txt)

    return c, g, (gen_txt, [clin_txt_1, clin_txt_2_age, clin_txt_2_gender])
def prepare_criteria(item):
    """
    Turn a filter item into clinical/genomic query criteria plus summary text.

    Single-entry operator dicts are read with ``next(iter(...))`` rather
    than ``.keys()[0]`` / ``.values()[0]``, so the function runs on both
    Python 2 and Python 3 (dict views are not subscriptable on Python 3).

    :param item: mapping that may hold a 'clinical_filter' and/or a
                 'genomic_filter' mapping
    :return: tuple ``(c, g, (gen_txt, [clin_txt_1, clin_txt_2_age,
             clin_txt_2_gender]))`` where

             * ``c`` is the clinical criteria: the clinical filter after the
               ``REREPLACEMENTS`` text substitutions, with BIRTH_DATE /
               REPORT_DATE values parsed to datetimes and the Oncotree
               diagnosis expanded to ``{'$in': [...]}`` over its subtree;
             * ``g`` is the genomic criteria: an ``{'$or': [...]}`` of
               per-variant-category clauses when any category was requested,
               plus the remaining genomic filter keys;
             * ``gen_txt`` is a text summary of the genomic filter (it stays
               a list when the filter has no 'TRUE_HUGO_SYMBOL');
             * the ``clin_txt_*`` strings summarise diagnosis, age and gender.
    """

    def _parse_date(text):
        # Try the fixed '%a, %d %b %Y %H:%M:%S' layout (with any " GMT"
        # removed) first; anything else is handed to dateutil.
        try:
            return datetime.datetime.strptime(
                text.replace(" GMT", ""), '%a, %d %b %Y %H:%M:%S')
        except ValueError:
            return dateutil.parser.parse(text)

    def _only(values):
        # First element of an iterable; used on one-entry operator dicts.
        return next(iter(values))

    onco_tree = oncotreenx.build_oncotree(settings.DATA_ONCOTREE_FILE)

    c = {}
    clin_txt_1 = ""
    clin_txt_2_gender = ""
    clin_txt_2_age = ""
    if 'clinical_filter' in item:
        clinical = item['clinical_filter']

        # Apply the REREPLACEMENTS substitutions on the serialised filter.
        clin_tmp = json.dumps(clinical)
        for key, val in REREPLACEMENTS.items():
            clin_tmp = clin_tmp.replace(key, val)
        c = json.loads(clin_tmp)

        if 'GENDER' in clinical:
            clin_txt_2_gender = clinical['GENDER']

        if 'BIRTH_DATE' in clinical:
            op = _only(clinical['BIRTH_DATE'].keys())
            val = _only(clinical['BIRTH_DATE'].values())
            born = _parse_date(val)

            # compute the age.
            today = datetime.date.today()
            age = today.year - born.year - (
                (today.month, today.day) < (born.month, born.day - 1))

            # A "gte" bound on the birth date is shown as an upper bound on
            # age; every other operator is shown as a lower bound.
            if "gte" in op:
                clin_txt_2_age = "< %s" % age
            else:
                clin_txt_2_age = "> %s" % age

        # parse date-times.
        for key in ['BIRTH_DATE', 'REPORT_DATE']:
            if key not in c:
                continue

            # extract the expression value.
            lkey, lval = _only(c[key].keys()), _only(c[key].values())
            c[key][lkey] = _parse_date(lval)

        # expand oncotree
        if 'ONCOTREE_PRIMARY_DIAGNOSIS_NAME' in clinical:
            txt = clinical['ONCOTREE_PRIMARY_DIAGNOSIS_NAME']

            # Stays None when the diagnosis text is not found in the tree; the
            # criteria then keep the literal text instead of failing on an
            # unassigned variable.
            nodes = None
            if txt == "_LIQUID_" or txt == "_SOLID_":
                node1 = oncotreenx.lookup_text(onco_tree, "Lymph")
                node2 = oncotreenx.lookup_text(onco_tree, "Blood")
                nodes1 = list(nx.dfs_tree(onco_tree, node1))
                nodes2 = list(nx.dfs_tree(onco_tree, node2))
                nodes = list(set(nodes1).union(set(nodes2)))

                if txt == "_SOLID_":
                    # Solid is everything outside the liquid subtrees.
                    all_nodes = set(list(onco_tree.nodes()))
                    nodes = list(all_nodes - set(nodes))

                clin_txt_1 = "%s cancers" % txt.replace("_", "").title()
            else:
                clin_txt_1 = "%s" % txt
                node = oncotreenx.lookup_text(onco_tree, txt)
                if onco_tree.has_node(node):
                    nodes = list(nx.dfs_tree(onco_tree, node))

            if nodes is not None:
                # `.nodes[...]` replaces the `.node` accessor removed in
                # networkx 2.4.
                nodes_txt = [onco_tree.nodes[n]['text'] for n in nodes]
                c['ONCOTREE_PRIMARY_DIAGNOSIS_NAME'] = {'$in': nodes_txt}

    g = {}
    gen_txt = []
    if 'genomic_filter' in item:
        genomic = item['genomic_filter']

        gen_tmp = json.dumps(genomic)
        for key, val in REREPLACEMENTS.items():
            gen_tmp = gen_tmp.replace(key, val)
        g = json.loads(gen_tmp)

        # add TRUE_HUGO_SYMBOL value mutational signature filter queries
        if 'TRUE_HUGO_SYMBOL' in g and g['TRUE_HUGO_SYMBOL'] == {'$in': ['']}:
            g['TRUE_HUGO_SYMBOL'] = None

        sv_test = False
        mut_test = False
        cnv_test = False
        if 'VARIANT_CATEGORY' in genomic:
            variant_category = genomic['VARIANT_CATEGORY']
            if isinstance(variant_category, dict):
                for x in variant_category.values():
                    categories = set(x)
                    if "SV" in categories:
                        sv_test = True
                    if "CNV" in categories:
                        cnv_test = True
                    if "MUTATION" in categories:
                        mut_test = True
            elif variant_category == 'SV':
                sv_test = True
            elif variant_category == 'CNV':
                cnv_test = True
            elif variant_category == 'MUTATION':
                mut_test = True

        # build text.
        exon_txt = ""
        protein_txt = ""
        if mut_test:
            gen_txt.append("Mutation")
            if 'TRUE_EXON_CHANGE' in genomic:
                exon_txt = genomic['TRUE_EXON_CHANGE']
            if 'TRUE_PROTEIN_CHANGE' in genomic:
                protein_txt = genomic['TRUE_PROTEIN_CHANGE']

        if cnv_test:
            if 'CNV_CALL' in g:
                if isinstance(g['CNV_CALL'], dict):
                    gen_txt += _only(g['CNV_CALL'].values())
                else:
                    gen_txt.append(g['CNV_CALL'])

        if sv_test:
            gen_txt.append("Structural rearrangement")

        if 'MMR_STATUS' in genomic:
            gen_txt.append(genomic['MMR_STATUS'])
        if 'TABACCO_STATUS' in genomic:
            gen_txt.append('Tobacco Mutational Signature')
        if 'TEMOZOLOMIDE_STATUS' in genomic:
            gen_txt.append('Temozolomide Mutational Signature')
        if 'POLE_STATUS' in genomic:
            gen_txt.append('PolE Mutational Signature')
        if 'APOBEC_STATUS' in genomic:
            gen_txt.append('APOBEC Mutational Signature')
        if 'UVA_STATUS' in genomic:
            gen_txt.append('UVA Mutational Signature')

        # One clause per requested variant category; they are OR-ed below.
        clauses = []
        if mut_test:
            clause = {
                'VARIANT_CATEGORY': 'MUTATION',
                'TRUE_HUGO_SYMBOL': g['TRUE_HUGO_SYMBOL']
            }
            if 'WILDTYPE' in g:
                clause['WILDTYPE'] = g['WILDTYPE']
            if 'TRUE_PROTEIN_CHANGE' in g:
                clause['TRUE_PROTEIN_CHANGE'] = g['TRUE_PROTEIN_CHANGE']
            clauses.append(clause)

        if cnv_test:
            clause = {
                'VARIANT_CATEGORY': 'CNV',
                'TRUE_HUGO_SYMBOL': g['TRUE_HUGO_SYMBOL'],
            }
            if 'CNV_CALL' in g:
                clause['CNV_CALL'] = g['CNV_CALL']
            if 'WILDTYPE' in g:
                clause['WILDTYPE'] = g['WILDTYPE']
            clauses.append(clause)

        if sv_test:
            true_hugo = genomic['TRUE_HUGO_SYMBOL']
            if isinstance(true_hugo, dict):
                genes = _only(true_hugo.values())
            else:
                genes = [true_hugo]

            # Also search for every known synonym of the requested genes.
            to_add = list()
            for gene in genes:
                if gene in synonyms:
                    to_add += synonyms[gene]
            genes = genes + to_add

            # One case-insensitive pattern per gene, matching it as a whole
            # word anywhere in the comment. The template is a raw string so
            # `\W` is not treated as a (deprecated) string escape.
            # NOTE(review): gene names are interpolated into the pattern
            # unescaped -- confirm they can never contain regex
            # metacharacters.
            sv_clauses = []
            for gene in genes:
                abc = r"(.*\W%s\W.*)|(^%s\W.*)|(.*\W%s$)" % (gene, gene, gene)
                sv_clauses.append(re.compile(abc, re.IGNORECASE))

            clause = {
                'STRUCTURAL_VARIANT_COMMENT': {"$in": sv_clauses}
            }
            clauses.append(clause)

        if len(clauses) > 0:
            g = {
                "$or": clauses
            }

        # Keys already folded into the clauses above are not copied again.
        special_clauses = {
            'STRUCTURAL_VARIANT_COMMENT', 'VARIANT_CATEGORY',
            'TRUE_HUGO_SYMBOL', 'CNV_CALL', 'WILDTYPE', 'TRUE_PROTEIN_CHANGE'
        }
        for key in genomic:
            if key in special_clauses:
                continue
            g[key] = genomic[key]

        get_recursively(g, "GMT")

        if 'TRUE_HUGO_SYMBOL' in genomic:
            if isinstance(genomic['TRUE_HUGO_SYMBOL'], dict):
                genes = _only(genomic['TRUE_HUGO_SYMBOL'].values())
            else:
                genes = [genomic['TRUE_HUGO_SYMBOL']]

            genes = [str(i) for i in genes]
            genes = ', '.join(genes)

            if len(gen_txt) > 1:
                gen_txt = "%s: %s" % (genes, ', '.join(gen_txt))
            else:
                if exon_txt == "" and protein_txt == "":
                    gen_txt = "%s %s" % (genes, ', '.join(gen_txt))
                elif exon_txt != "":
                    gen_txt = "%s exon %s" % (genes, exon_txt)
                else:
                    gen_txt = "%s %s" % (genes, protein_txt)

    return c, g, (gen_txt, [clin_txt_1, clin_txt_2_age, clin_txt_2_gender])