def parse(self):
    """Build the curated EGFR synthetic lethal interactions.

    Two groups of interactions are produced:

    * one per validated EGFR-sensitizing gene taken from Table 1 of the
      paper (EGFR perturbed with an inhibitory antibody, partner gene with
      siRNA);
    * a single EGFR/AURKA interaction in which both genes are perturbed
      pharmaceutically.

    Returns:
        list: the SyntheticLethalInteraction objects; the Table 1 genes come
        first (in set-iteration order), the AURKA interaction is last.

    Raises:
        ValueError: if a Table 1 symbol, after being passed through
            ``get_current_symbol``, is not a key of ``self.entrez_dict``.
    """
    egfr = 'EGFR'
    egfr_id = self.get_ncbigene_curie(egfr)

    # Validated EGFR-sensitizing genes (Table 1 of the paper).
    # LOC63920 and LOC284393 were dropped from this list because they could
    # not be identified in HGNC.
    table1_symbols = {
        'ABL1', 'AKT2', 'ANXA6', 'ARF4', 'ARF5', 'ASCL2', 'BCAR1', 'CALM1',
        'CBLC', 'CCND1', 'CD59', 'CDH3', 'CXCL12', 'DCN', 'DDR2', 'DIXDC1',
        'DLG4', 'DUSP4', 'DUSP6', 'DUSP7', 'EPHA5', 'ERBB3', 'FER', 'FGFR2',
        'FLNA', 'GRB7', 'HSPA9', 'INPPL1', 'KLF10', 'LTK', 'MAP3K1', 'MAPK1',
        'MATK', 'NEDD9', 'NOTCH2', 'PIK3R1', 'PIK3R2', 'PIN1', 'PKN2',
        'PLSCR1', 'PPIA', 'PRKACB', 'PRKCD', 'PRKCE', 'PRKCZ', 'PTPRF',
        'RAC1', 'RAPGEF1', 'RASA3', 'RET', 'RPS6KA5', 'SC4MOL', 'SH2D3C',
        'SHC1', 'SMAD2', 'SOS2', 'STAT3', 'TBL1Y', 'VAV3',
    }

    interactions = []
    for listed_symbol in table1_symbols:
        partner = self.get_current_symbol(listed_symbol)
        if partner not in self.entrez_dict:
            # NOTE(review): the message cites "Blomen 2015"; this looks
            # copy-pasted from another parser -- confirm the citation.
            raise ValueError("Could not get NCBI id for gene \"%s\" in Blomen 2015" % partner)
        partner_id = self.get_ncbigene_curie(partner)
        interactions.append(
            SyntheticLethalInteraction(
                gene_A_symbol=egfr,
                gene_A_id=egfr_id,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.INHIBITORY_ANTIBODY,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.N_A,
                effect_size=0,
                cell_line=SlConstants.A431_CELL,
                cellosaurus_id=SlConstants.A431_CELLOSAURUS,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.CELL_VIABILITY_ASSAY,
                pmid=self.pmid,
                SL=True,
            )
        )

    # One additional interaction reported by the authors:
    # "Analysis based on the Chou-Talalay coefficient of interaction showed
    # that the small-molecule AURKA inhibitor PHA-680632 (29) synergized with
    # erlotinib in reducing cell viability of both A431 and HCT116 cells
    # (Fig. 6B). In HCT116 cells, we found strong synergy (coefficient of
    # interaction values <0.5) between cetuximab and either PHA-680632 or
    # another AURKA inhibitor, C1368"
    aurka = 'AURKA'
    aurka_id = self.get_ncbigene_curie(aurka)
    interactions.append(
        SyntheticLethalInteraction(
            gene_A_symbol=egfr,
            gene_A_id=egfr_id,
            gene_B_symbol=aurka,
            gene_B_id=aurka_id,
            gene_A_pert=SlConstants.PHARMACEUTICAL,
            gene_B_pert=SlConstants.PHARMACEUTICAL,
            effect_type=SlConstants.N_A,
            effect_size=0,
            cell_line=SlConstants.HCT_116,
            cellosaurus_id=SlConstants.HCT_116_CELLOSAURUS,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay=SlConstants.CELL_VIABILITY_ASSAY,
            pmid=self.pmid,
            SL=True,
        )
    )
    return interactions
def parse(self):
    """Build the curated BRCA2 and BRCA1 synthetic lethal interactions.

    While POLQ serves as a positive control, FEN1 and APEX2 represent novel
    B2SL genes and novel potential drug targets in BRCA-deficient tumors.
    The authors do not concretely name the entire list of SLIs, so only the
    three that are investigated in detail are recorded. FEN1 (and, in this
    code, APEX2) is additionally recorded for BRCA1.

    Returns:
        list: SyntheticLethalInteraction objects, the BRCA2 interactions
        first, followed by the BRCA1 interactions.
    """
    # (gene A symbol, gene B symbols, cell line, Cellosaurus id)
    screens = (
        ('BRCA2', {'POLQ', 'FEN1', 'APEX2'},
         SlConstants.PEO1_CELL, SlConstants.PEO1_CELLOSAURUS),
        ('BRCA1', {'FEN1', 'APEX2'},
         'BRCA1 isogenic RPE1 cell line', SlConstants.N_A),
    )

    interactions = []
    for query_symbol, partner_symbols, cell_line, cellosaurus_id in screens:
        query_id = self.get_ncbigene_curie(query_symbol)
        for partner_symbol in partner_symbols:
            partner_id = self.get_ncbigene_curie(partner_symbol)
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=query_symbol,
                    gene_A_id=query_id,
                    gene_B_symbol=partner_symbol,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.LOF_MUTATION,
                    gene_B_pert=SlConstants.CRISPR_CAS9,
                    effect_type=SlConstants.N_A,
                    effect_size=SlConstants.N_A,
                    cell_line=cell_line,
                    cellosaurus_id=cellosaurus_id,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.MULTICOLOR_COMPETITION_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions
def parse(self):
    """Build the curated TP53 interactions with SGK2 and PAK3.

    TP53 is recorded with a degradation perturbation, the partner gene with
    shRNA. The PubMed id is the literal ``'20616055'`` rather than
    ``self.pmid``.

    Returns:
        list: one SyntheticLethalInteraction per partner gene.
    """
    tp53 = 'TP53'
    pmid = '20616055'
    partner_symbols = {'SGK2', 'PAK3'}
    tp53_id = self.get_ncbigene_curie(tp53)
    return [
        SyntheticLethalInteraction(
            gene_A_symbol=tp53,
            gene_A_id=tp53_id,
            gene_B_symbol=partner,
            gene_B_id=self.get_ncbigene_curie(partner),
            gene_A_pert=SlConstants.DEGRADATION,
            gene_B_pert=SlConstants.SH_RNA,
            effect_type=SlConstants.N_A,
            effect_size=SlConstants.N_A,
            cell_line='primary human foreskin keratinocytes',
            cellosaurus_id=SlConstants.N_A,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay=SlConstants.CELL_VIABILITY_ASSAY,
            pmid=pmid,
            SL=True,
        )
        for partner in partner_symbols
    ]
def parse(self):
    """Build the curated CCNE1 synthetic lethal interactions.

    CCNE1 is recorded as overexpressed, each partner gene as perturbed by
    shRNA; the cell line field is the free-text value
    ``'102 cancer cell lines'``.

    Returns:
        list: one SyntheticLethalInteraction per partner gene.
    """
    ccne1 = 'CCNE1'
    ccne1_id = self.get_ncbigene_curie(ccne1)
    partner_symbols = {
        'CDK2', 'ACAT2', 'CSE1L', 'BRCA1', 'CCNA2', 'CDC42', 'CHD2', 'DDX17',
        'DUSP16', 'ENPP2', 'HNRNPA3', 'IARS2', 'MYC', 'PSMA5', 'RRM1',
        'SLC35A3', 'SMC2', 'SPATA6', 'SRBD1', 'TPX2', 'TUBB', 'UBA1', 'VCP',
        'XRCC2',
    }
    interactions = []
    for partner in partner_symbols:
        partner_id = self.get_ncbigene_curie(partner)
        interactions.append(
            SyntheticLethalInteraction(
                gene_A_symbol=ccne1,
                gene_A_id=ccne1_id,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.OVEREXPRESSION,
                gene_B_pert=SlConstants.SH_RNA,
                effect_type=SlConstants.N_A,
                effect_size=SlConstants.N_A,
                cell_line='102 cancer cell lines',
                cellosaurus_id=SlConstants.N_A,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.SH_RNA_DEPLETION_ASSAY,
                pmid=self.pmid,
                SL=True,
            )
        )
    return interactions
def parse(self):
    """Build the curated TP53 interactions found by siRNA knock-down.

    The partner genes are the 18 of 30 candidate genes that displayed <50%
    relative viability. ``DICER1`` is used because it is the correct symbol
    for DICER.

    Returns:
        list: one SyntheticLethalInteraction per partner gene.
    """
    tp53 = 'TP53'
    tp53_id = self.get_ncbigene_curie(tp53)
    positive_symbols = {
        'AMFR', 'ATM', 'CAPN9', 'DICER1', 'MACF1', 'MADCAM1', 'MCL1', 'MED21',
        'MET', 'MON1B', 'PLCB4', 'RAB8B', 'RAD1', 'SRPK1', 'STAU1', 'TGFB2',
        'TRPC1', 'VEGFA',
    }
    return [
        SyntheticLethalInteraction(
            gene_A_symbol=tp53,
            gene_A_id=tp53_id,
            gene_B_symbol=partner,
            gene_B_id=self.get_ncbigene_curie(partner),
            gene_A_pert=SlConstants.PHARMACEUTICAL,
            gene_B_pert=SlConstants.SI_RNA,
            effect_type=SlConstants.N_A,
            effect_size=SlConstants.N_A,
            cell_line=SlConstants.HCT_116,
            cellosaurus_id=SlConstants.HCT_116_CELLOSAURUS,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay=SlConstants.CELL_VIABILITY_ASSAY,
            pmid=self.pmid,
            SL=True,
        )
        for partner in positive_symbols
    ]
def create_sli(self, geneB_sym):
    """Create one TP53 / ``geneB_sym`` synthetic lethal interaction.

    TP53 is recorded with a loss-of-function mutation, the partner gene with
    siRNA, using the HCT 116 / colon carcinoma constants.

    Args:
        geneB_sym: symbol of the partner gene; its id is resolved with
            ``get_ncbigene_curie``.

    Returns:
        SyntheticLethalInteraction: the newly created interaction (SL=True).
    """
    gene_a_symbol = 'TP53'
    gene_a_id = self.get_ncbigene_curie(gene_a_symbol)
    gene_b_id = self.get_ncbigene_curie(geneB_sym)
    # NOTE(review): "10090" is the NCBI Taxonomy id of Mus musculus while the
    # cell line constant is named HCT_116 -- confirm the intended species.
    return SyntheticLethalInteraction(
        gene_A_symbol=gene_a_symbol,
        species_id="10090",
        gene_A_id=gene_a_id,
        gene_B_symbol=geneB_sym,
        gene_B_id=gene_b_id,
        gene_A_pert=SlConstants.LOF_MUTATION,
        gene_B_pert=SlConstants.SI_RNA,
        effect_type="n/a",
        effect_size="n/a",
        cell_line=SlConstants.HCT_116,
        cellosaurus_id=SlConstants.HCT_116_CELLOSAURUS,
        cancer_type=SlConstants.COLON_CARCINOMA,
        ncit_id=SlConstants.COLON_CARCINOMA_NCIT,
        assay=SlConstants.RNA_INTERFERENCE_ASSAY,
        pmid=self.pmid,
        SL=True,
    )
def parse(self):
    """Build the curated GSK3B interactions with cohesin genes.

    Wnt agonist LY2090314, which mimics Wnt activation by inhibiting GSK3-β
    (Atkinson et al., 2015), emerged as a novel class of compound that
    inhibited the growth of all three cohesin mutants tested (RAD21, SMC3
    and STAG2 deletion mutations in the breast epithelial cell line MCF10A).

    Returns:
        list: one SyntheticLethalInteraction per cohesin gene.
    """
    gsk3b = 'GSK3B'
    gsk3b_id = self.get_ncbigene_curie(gsk3b)
    cohesin_symbols = {'RAD21', 'SMC3', 'STAG2'}
    return [
        SyntheticLethalInteraction(
            gene_A_symbol=gsk3b,
            gene_A_id=gsk3b_id,
            gene_B_symbol=cohesin_gene,
            gene_B_id=self.get_ncbigene_curie(cohesin_gene),
            gene_A_pert=SlConstants.PHARMACEUTICAL,
            gene_B_pert=SlConstants.CRISPR_CAS9,
            effect_type=SlConstants.N_A,
            effect_size=SlConstants.N_A,
            cell_line=SlConstants.MCF10A_CELL,
            cellosaurus_id=SlConstants.MCF10A_CELLOSAURUS,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay=SlConstants.CELL_VIABILITY_ASSAY,
            pmid=self.pmid,
            SL=True,
        )
        for cohesin_gene in cohesin_symbols
    ]
def parse(self):
    """Parse PIK3CA interactions from the tab-separated file ``self.fname``.

    Every row must provide the columns ``Gene symbol``, ``PX-866`` and
    ``NVP-BEZ235``. The effect size of an interaction is the mean of the two
    numeric columns and is stored as a fold change.

    Returns:
        list: one SyntheticLethalInteraction per row of the file.
    """
    pik3ca = 'PIK3CA'
    pik3ca_id = self.get_ncbigene_curie(pik3ca)
    interactions = []
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            partner = self.get_current_symbol(record['Gene symbol'])
            partner_id = self.get_ncbigene_curie(partner)
            fc_px866 = float(record['PX-866'])
            fc_nvpbez235 = float(record['NVP-BEZ235'])
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=pik3ca,
                    gene_A_id=pik3ca_id,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.PHARMACEUTICAL,
                    gene_B_pert=SlConstants.SI_RNA,
                    effect_type=SlConstants.FOLD_CHANGE,
                    # mean fold change over the two compounds
                    effect_size=0.5 * (fc_px866 + fc_nvpbez235),
                    cell_line=SlConstants.U87WT_CELL,
                    cellosaurus_id=SlConstants.U87WT_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.GROWTH_INHIBITION_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions
def parse(self):
    """Collect the negative and positive TOP1 interactions.

    ``self._add_negatives()`` is called first; afterwards the positive
    interactions are appended to ``self.sli_list``. For positives, genes
    with more than half of >= 7 siRNAs yielding > 4-fold sensitization
    (Figure 1b) are taken.

    Returns:
        The ``self.sli_list`` attribute after the positives were appended.
    """
    self._add_negatives()

    top1 = 'TOP1'
    top1_id = self.get_ncbigene_curie(top1)
    # TAB2 is the current symbol for MAP3K7IP2
    positive_symbols = {'ATR', 'TAB2', 'PPP2R1A', 'RNF31', 'TRAF6', 'UPF1', 'USP5'}
    for partner in positive_symbols:
        partner_id = self.get_ncbigene_curie(partner)
        self.sli_list.append(
            SyntheticLethalInteraction(
                gene_A_symbol=top1,
                gene_A_id=top1_id,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.PHARMACEUTICAL,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.N_A,
                effect_size=SlConstants.N_A,
                cell_line=SlConstants.MDAMB231_CELL,
                cellosaurus_id=SlConstants.MDAMB231_CELLOSAURUS,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.CELL_VIABILITY_ASSAY,
                pmid=self.pmid,
                SL=True,
            )
        )
    return self.sli_list
def parse(self):
    """Parse the tab-separated interaction table in ``self.fname``.

    Each row provides a comma-separated list of gene A symbols
    (``geneAlist``), one gene B symbol (``geneB``) and an effect size
    (``effect``, which may be written with a decimal comma). One interaction
    is created for every gene A / gene B combination of a row, except for
    self-loops (both symbols resolving to the same NCBI gene id).

    All interactions are grouped by gene pair and handed to
    ``self._mark_maximum_entries``.

    Returns:
        The value returned by ``self._mark_maximum_entries`` for the
        collected ``{GenePair: [SyntheticLethalInteraction, ...]}`` mapping.

    Raises:
        ValueError: if a row has fewer than 4 fields, or if a gene A or
            gene B symbol (after ``get_current_symbol``) is not a key of
            ``self.entrez_dict``.
    """
    gene2_perturbation = 'natural (is a TSG)'
    assay = "pharmaceutical + siRNA"
    # Collects every SLI seen for a given gene A / gene B pair
    sli_dict = defaultdict(list)
    with open(self.fname) as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter='\t')
        for row in csvreader:
            if len(row) < 4:
                raise ValueError(
                    "Only got %d fields but was expecting at least 4 tab-separated fields" % len(row))
            # the gene A column may contain several comma-separated genes
            geneA_symbols = row['geneAlist'].split(",")
            geneB_sym = self.get_current_symbol(row['geneB'])
            # The symbol has to be looked up in the Entrez dictionary.
            # (Previously it was tested against the string returned by
            # get_current_symbol, which is a substring test and let unknown
            # symbols through with the id "NCBIGene:None".)
            if geneB_sym not in self.entrez_dict:
                raise ValueError(
                    "Could not find id for geneB %s in Srivasa 2016" % geneB_sym)
            geneB_id = "NCBIGene:{}".format(self.entrez_dict.get(geneB_sym))
            effect = float(row['effect'].replace(",", "."))
            for listed_symbol in geneA_symbols:
                geneA_sym = self.get_current_symbol(listed_symbol)
                if geneA_sym not in self.entrez_dict:
                    raise ValueError(
                        "Could not find id for geneA %s in Srivasa 2016" % geneA_sym)
                geneA_id = "NCBIGene:{}".format(self.entrez_dict.get(geneA_sym))
                if geneA_id == geneB_id:
                    # There are a few self loops in the data, but these are
                    # not SLIs, so we skip them
                    continue
                sli = SyntheticLethalInteraction(
                    gene_A_symbol=geneA_sym,
                    gene_A_id=geneA_id,
                    gene_B_symbol=geneB_sym,
                    gene_B_id=geneB_id,
                    gene_A_pert=SlConstants.PHARMACEUTICAL,
                    gene_B_pert=gene2_perturbation,
                    effect_type=SlConstants.ZSCORE,
                    effect_size=effect,
                    cell_line=SlConstants.HELA_CELL,
                    cellosaurus_id=SlConstants.HELA_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=assay,
                    pmid=self.pmid,
                    SL=True)
                sli_dict[GenePair(geneA_sym, geneB_sym)].append(sli)
    return self._mark_maximum_entries(sli_dict)
def parse(self):
    """Parse MYC interactions from the tab-separated file ``self.fname``.

    Every row must have exactly three fields and provide the columns
    ``symbol`` and ``median.pair.diffs``; the latter becomes the effect
    size. Symbols that cannot be resolved are skipped when they are listed
    as unclear (pseudogenes, divergent non-coding transcripts, symbols that
    map to several newer symbols) and are an error otherwise.

    Returns:
        The value returned by ``self._mark_maximum_entries`` for the
        collected ``{GenePair: [SyntheticLethalInteraction, ...]}`` mapping.

    Raises:
        ValueError: on a row that does not have three fields, or on a symbol
            that is neither in ``self.entrez_dict`` nor listed as unclear.
    """
    myc = 'MYC'
    myc_id = self.get_ncbigene_curie(myc)
    # Pseudogenes, divergent nc transcripts.
    # DIP maps to two newer symbols (also GIF).
    unclear_gene_symbols = {
        'ATP5EP1', 'C10orf111', 'C19ORF30', 'C3ORF51', 'CG030', 'CLEC4GP1',
        'CSN1S2A', 'DIP', 'DKFZP434I0714', 'DVL1L1', 'FLJ20674', 'FLJ22447',
        'GIF', 'HCG27', 'HMG14P', 'IGLV@', 'LDHBP', 'OR5D2P', 'RBMXP1',
        'RPL19P1',
    }
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) != 3:
                raise ValueError("Bad row with %d fields: %s" % (len(record), record))
            partner = self.get_current_symbol(record['symbol'])
            if partner not in self.entrez_dict:
                if partner in unclear_gene_symbols:
                    continue
                raise ValueError(
                    "Could not find id for %s in Kessler 2012 " % partner)
            partner_id = self.get_ncbigene_curie(partner)
            median_diff = float(record['median.pair.diffs'])
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=myc,
                gene_A_id=myc_id,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.OVEREXPRESSION,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.LOG2_DECREASE_IN_ABUNDANCE,
                effect_size=median_diff,
                cell_line='human mammary epithelial cells',
                cellosaurus_id=SlConstants.N_A,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                pmid=self.pmid,
                SL=True,
            )
            by_pair[GenePair(myc, partner)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse CHEK1 interactions from the tab-separated file ``self.fname``.

    The file has the columns ``Z-Score``, ``Symbol``, ``Entrez ID`` and
    ``Gene Name``; only ``Symbol`` and ``Z-Score`` are read. Rows whose
    symbol resolves to CHEK1 itself are skipped (no self-loops). An
    interaction is flagged ``SL=True`` only for the hard-coded list of genes
    below; every other row yields an interaction with ``SL=False``.

    Returns:
        The value returned by ``self._mark_maximum_entries`` for the
        collected ``{GenePair: [SyntheticLethalInteraction, ...]}`` mapping.

    Raises:
        ValueError: on a row with fewer than three fields or a symbol that
            is not a key of ``self.entrez_dict``.
    """
    chek1 = 'CHEK1'
    chek1_id = 'NCBIGene:1111'
    # genes recorded as synthetic lethal; all others are stored with SL=False
    sl_genes = ("FZR1", "RAD17", "RFC1", "BLM", "CDC73", "CDC6", "WEE1")
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 3:
                raise ValueError("Only got %d fields but was expecting at least 3" % len(record))
            partner = self.get_current_symbol(record['Symbol'])
            if partner == 'CHEK1':
                continue  # Do not allow self-loops!
            if partner not in self.entrez_dict:
                raise ValueError("Could not find id for gene symbol %s in Shen 2015" % partner)
            partner_id = "NCBIGene:{}".format(self.entrez_dict.get(partner))
            # the Z-score may be written with a decimal comma
            zscore = float(record['Z-Score'].replace(",", "."))
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=chek1,
                gene_A_id=chek1_id,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.PHARMACEUTICAL,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.ZSCORE,
                effect_size=zscore,
                cell_line="HeLa-Cells",
                cellosaurus_id="CVCL_0030",
                cancer_type="",
                ncit_id="",
                assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                pmid=self.pmid,
                SL=partner in sl_genes,
            )
            by_pair[GenePair(chek1, partner)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse gene pairs from supplemental file 1 (``self.fname``).

    The ``Drug-target.Pairs`` column holds two gene symbols separated by
    ``__``. Both genes are recorded with an sgRNA perturbation. No exact
    effect value is given, but the authors state at least -4 for all SLIs,
    so -4 is stored as Z-score for every pair.

    Returns:
        list: one SyntheticLethalInteraction per row of the file.

    Raises:
        ValueError: on a row with fewer than 4 fields, or when one of the
            two symbols is not a key of ``self.entrez_dict``.
    """
    interactions = []
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 4:
                raise ValueError(
                    "Only got %d fields but was expecting at least 4 tab-separated fields" % len(record))
            # split "A__B" into the two gene symbols
            pair = record['Drug-target.Pairs'].split("__")
            first_symbol = self.get_current_symbol(pair[0])
            second_symbol = self.get_current_symbol(pair[1])
            if first_symbol not in self.entrez_dict:
                raise ValueError(
                    "could not find id for gene A (%s) in Han 2017" % first_symbol)
            first_id = "NCBIGene:{}".format(self.entrez_dict.get(first_symbol))
            if second_symbol not in self.entrez_dict:
                raise ValueError(
                    "could not find id for gene B (%s) in Han 2017" % second_symbol)
            second_id = "NCBIGene:{}".format(self.entrez_dict.get(second_symbol))
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=first_symbol,
                    gene_A_id=first_id,
                    gene_B_symbol=second_symbol,
                    gene_B_id=second_id,
                    gene_A_pert=SlConstants.SG_RNA,
                    gene_B_pert=SlConstants.SG_RNA,
                    effect_type=SlConstants.ZSCORE,
                    effect_size=-4,
                    cell_line=SlConstants.K562_CELL,
                    cellosaurus_id=SlConstants.K562_CELLOSAURUS,
                    cancer_type=SlConstants.CHRONIC_MYELOGENOUS_LEUKEMIA,
                    ncit_id=SlConstants.CHRONIC_MYELOGENOUS_LEUKEMIA_NCIT,
                    assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions
def parseKRAS(self):
    """Append KRAS interactions from a supplemental table to ``self.sli_list``.

    Only rows whose ``Expression`` column equals ``'Expressed'`` are
    considered. A row yields an interaction when its
    ``80% Confidence Interval (P<0.2)`` column equals 1. Symbols that cannot
    be resolved are skipped if they are listed in
    ``self.unclear_gene_symbols`` and are an error otherwise.

    Nothing is returned; the interactions are appended to ``self.sli_list``.

    Raises:
        ValueError: if a gene symbol is neither in ``self.entrez_dict`` nor
            in ``self.unclear_gene_symbols``.
    """
    geneA = 'KRAS'
    geneAid = self.get_ncbigene_curie(geneA)
    # NOTE(review): the file name mentions PTEN although gene A is KRAS --
    # confirm that this is the intended supplemental table.
    fname = 'data/vizeacoumarSuppl4-PTEN.tsv'
    with open(fname) as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter='\t')
        for row in csvreader:
            if row['Expression'] != 'Expressed':
                continue
            geneBsym = self.get_current_symbol(row['human gene'])
            if geneBsym not in self.entrez_dict:
                if geneBsym in self.unclear_gene_symbols:
                    continue
                raise ValueError(
                    "Could not find iid for %s in Brough 2018 2008 " % geneBsym)
            geneB_id = self.get_ncbigene_curie(geneBsym)
            # the column is a 0/1 flag; only flagged rows become SLIs
            if int(row['80% Confidence Interval (P<0.2)']) != 1:
                continue
            sli = SyntheticLethalInteraction(
                gene_A_symbol=geneA,
                gene_A_id=geneAid,
                gene_B_symbol=geneBsym,
                gene_B_id=geneB_id,
                gene_A_pert=SlConstants.ACTIVATING_MUTATION,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type='confidence.80%',
                effect_size='true',
                cell_line='HCT 116',
                cellosaurus_id='CVCL_0291',
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.MULTICOLOR_COMPETITION_ASSAY,
                pmid=self.pmid,
                SL=True)
            self.sli_list.append(sli)
def parse(self):
    """Parse MYC interactions from the tab-separated file ``self.fname``.

    Expected columns: ``Gene.Symbol``, ``Accession.number``,
    ``Z.score.greaterthan``, ``%Viability.HFF-pB``, ``%Viability.HFF-MYC``
    and ``Ratio pBabe/Myc``. Only the symbol and the Z-score column are
    read; the Z-score is stored as effect size with effect type
    ``'stddev'``.

    Returns:
        list: one SyntheticLethalInteraction per usable row.

    Raises:
        ValueError: on a row that does not have six fields, or on a symbol
            that is not a key of ``self.entrez_dict``.
    """
    myc = 'MYC'
    # Symbols that are not current but either could not be matched or match
    # to multiple possible candidates; checked before symbol mapping.
    unclear_gene_symbols = {'MLCK'}
    interactions = []
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) != 6:
                raise ValueError("Bad line with %d fields: %s" % (len(record), record))
            listed_symbol = record['Gene.Symbol']
            if listed_symbol in unclear_gene_symbols:
                continue
            partner = self.get_current_symbol(listed_symbol)
            if partner not in self.entrez_dict:
                raise ValueError(
                    "Could not find id for symbol %s in Toyoshima 2008" % partner)
            partner_id = "NCBIGene:{}".format(self.entrez_dict.get(partner))
            zscore = float(record['Z.score.greaterthan'])
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=myc,
                    gene_A_id=SlConstants.MYC_GENE_ID,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.OVEREXPRESSION,
                    gene_B_pert=SlConstants.SI_RNA,
                    effect_type='stddev',
                    effect_size=zscore,
                    cell_line='HFF-Myc',
                    cellosaurus_id='CVCL_Y511',
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions
def parse(self):
    """Parse VHL interactions from ``self.fname`` (one gene symbol per line).

    Each stripped line is resolved in this order: a symbol that is a key of
    ``self.entrez_dict`` is used as is; DARS, NARS, KARS and YARS get a
    trailing ``1``; a few outdated symbols are replaced through a fixed
    mapping; QARS and SARS are skipped as unclear; anything else is an
    error.

    Returns:
        list: one SyntheticLethalInteraction per resolved symbol.

    Raises:
        ValueError: if a line matches none of the cases above.
    """
    vhl = 'VHL'
    unclear_gene_symbols = {'QARS', 'SARS'}
    # A group of tRNA genes that need to have the '1' appended
    # (could be mapped uniquely with HGNC).
    trna_symbols = ('DARS', 'NARS', 'KARS', 'YARS')
    # Mappings found to be correct and unique with the HGNC website.
    mappings = {
        'ORAOV1': 'LTO1',
        'VWA9': 'INTS14',
        'NARFL': 'CIAO3',
        'WBSCR22': 'BUD23',
        'UFD1L': 'UFD1',
    }
    interactions = []
    with open(self.fname) as handle:
        for line in handle:
            listed_symbol = line.strip()
            if listed_symbol in self.entrez_dict:
                partner = listed_symbol
            elif listed_symbol in trna_symbols:
                partner = "%s1" % listed_symbol
            elif listed_symbol in mappings:
                partner = mappings[listed_symbol]
            elif listed_symbol in unclear_gene_symbols:
                continue
            else:
                raise ValueError("Could not find id for %s in Sun 2019" % listed_symbol)
            partner_id = self.get_ncbigene_curie(partner)
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=vhl,
                    gene_A_id=SlConstants.VHL_GENE_ID,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.LOF_MUTATION,
                    gene_B_pert=SlConstants.CRISPR_CAS9,
                    effect_type=SlConstants.N_A,
                    effect_size=SlConstants.N_A,
                    cell_line=SlConstants.A498_CELL,
                    cellosaurus_id=SlConstants.A498_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.CRISPR_CAS9_INTERFERENCE_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions
def parse(self):
    """Parse knockout interactions from the tab-separated file ``self.fname``.

    Columns of the file: ``GENE``, ``SUMMARY``, ``PUBMED ID`` and
    ``INTERACTING QUERY GENE``. The last column holds a ``;``-separated list
    of partner genes; one interaction is created for every ``GENE`` /
    partner combination. Both genes are recorded with the knockout
    perturbation.

    Returns:
        The value returned by ``self._mark_maximum_entries`` for the
        collected ``{GenePair: [SyntheticLethalInteraction, ...]}`` mapping.

    Raises:
        ValueError: if a symbol (after ``get_current_symbol``) is not a key
            of ``self.entrez_dict``.
    """
    knockout = SlConstants.KNOCKOUT
    hap1_cellosaurus = SlConstants.HAP1_CELLOSAURUS
    assay = 'proportions.of.sense.and.antisense.insertions'
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            query = self.get_current_symbol(record['GENE'])
            if query not in self.entrez_dict:
                raise ValueError(
                    "[ERROR] We could not find a gene id for " + query)
            query_id = "NCBIGene:{}".format(self.entrez_dict.get(query))
            for raw_partner in record['INTERACTING QUERY GENE'].split(';'):
                partner = self.get_current_symbol(raw_partner.strip())
                if partner not in self.entrez_dict:
                    raise ValueError(
                        "Could not get NCBI id for gene \"%s\" in Blomen 2015" % partner)
                partner_id = "NCBIGene:{}".format(self.entrez_dict.get(partner))
                interaction = SyntheticLethalInteraction(
                    gene_A_symbol=query,
                    gene_A_id=query_id,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=knockout,
                    gene_B_pert=knockout,
                    effect_type=SlConstants.N_A,
                    effect_size=0,
                    cell_line=SlConstants.HAP1_CELL,
                    cellosaurus_id=hap1_cellosaurus,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=assay,
                    pmid=self.pmid,
                    SL=True,
                )
                by_pair[GenePair(query, partner)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse_suppl10_11(self, fname):
    """Add RB1 interactions from a supplemental table to ``self.sli_dict``.

    Reads the tab-separated file ``fname`` (columns ``target`` and
    ``Penetrance.(%)``). Rows with a penetrance of at least 80 are stored
    under ``GenePair('RB1', target)`` in ``self.sli_dict``.

    Args:
        fname: path of the tab-separated supplemental table.

    Raises:
        ValueError: if a target symbol is neither a key of
            ``self.entrez_dict`` nor contained in
            ``self.unclear_gene_symbols``.
    """
    rb1 = 'RB1'
    with open(fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            partner = self.get_current_symbol(record['target'])
            if ',' in partner:
                # Some entries look like "PMS2,PMS2CL"; the effect cannot be
                # assigned unambiguously to one of the genes.
                continue
            if partner not in self.entrez_dict:
                if partner in self.unclear_gene_symbols:
                    continue
                raise ValueError("Could not find id for %s in Brough 2018 2008 " % partner)
            partner_id = self.get_ncbigene_curie(partner)
            penetrance = int(record['Penetrance.(%)'])
            if penetrance < 80:
                continue
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=rb1,
                gene_A_id=SlConstants.RB1_GENE_ID,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.LOF_MUTATION,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type="penetrance",
                effect_size=penetrance,
                cell_line=SlConstants.N_A,
                cellosaurus_id=SlConstants.N_A,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay="siMEM+penetrance",
                pmid=self.pmid,
                SL=True,
            )
            self.sli_dict[GenePair(rb1, partner)].append(interaction)
def parse_suppl9(self):
    """Add RB1 interactions from supplemental table 9 to ``self.sli_dict``.

    Reads ``data/brough_2012_suppl9.tsv`` (columns ``symbol`` and
    ``Penetrance.%``). Rows with a penetrance of at least 80 are stored
    under ``GenePair('RB1', symbol)`` in ``self.sli_dict``.

    Raises:
        ValueError: if a symbol is neither a key of ``self.entrez_dict`` nor
            contained in ``self.unclear_gene_symbols``.
    """
    rb1 = 'RB1'
    table_path = 'data/brough_2012_suppl9.tsv'
    with open(table_path) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            partner = self.get_current_symbol(record['symbol'])
            if partner not in self.entrez_dict:
                if partner in self.unclear_gene_symbols:
                    continue
                raise ValueError("Could not find iid for %s in Brough 2018 2008 " % partner)
            partner_id = "NCBIGene:{}".format(self.entrez_dict.get(partner))
            penetrance = int(record['Penetrance.%'])
            if penetrance < 80:
                continue
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=rb1,
                gene_A_id=SlConstants.RB1_GENE_ID,
                gene_B_symbol=partner,
                gene_B_id=partner_id,
                gene_A_pert=SlConstants.LOF_MUTATION,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type="penetrance",
                effect_size=penetrance,
                cell_line=SlConstants.N_A,
                cellosaurus_id=SlConstants.N_A,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay="siMEM+penetrance",
                pmid=self.pmid,
                SL=True,
            )
            self.sli_dict[GenePair(rb1, partner)].append(interaction)
def setUp(self) -> None:
    """Create the fixture shared by the tests.

    Stores the constructor keyword arguments in ``self.parameters``, an
    empty list in ``self.no_getter`` and a SyntheticLethalInteraction built
    from the parameters in ``self.sli``.
    """
    self.parameters = dict(
        gene_A_symbol='VPS54',
        gene_A_id='NCBIGene:51542',
        gene_B_symbol='PTAR1',
        gene_B_id='NCBIGene:375743',
        gene_A_pert='pert1',
        gene_B_pert='pert2',
        effect_type='thisEff',
        effect_size='20',
        cell_line='cellLine42',
        cellosaurus_id='csID1',
        cancer_type='melanoma',
        ncit_id='ncit1234',
        assay='thisAssay',
        pmid='27453043',
        SL=True,
    )
    self.no_getter = []
    self.sli = SyntheticLethalInteraction(**self.parameters)
def create_sli(self, geneB, SL):
    """Create one STAG2 / ``geneB`` interaction.

    STAG2 is recorded with a loss-of-function mutation, the partner gene
    with shRNA.

    Args:
        geneB: symbol of the partner gene; its id is resolved with
            ``get_ncbigene_curie``.
        SL: value passed through as the ``SL`` flag of the interaction.

    Returns:
        SyntheticLethalInteraction: the newly created interaction.
    """
    stag2_symbol = 'STAG2'
    stag2_id = self.get_ncbigene_curie(stag2_symbol)
    partner_id = self.get_ncbigene_curie(geneB)
    return SyntheticLethalInteraction(
        gene_A_symbol=stag2_symbol,
        gene_A_id=stag2_id,
        gene_B_symbol=geneB,
        gene_B_id=partner_id,
        gene_A_pert=SlConstants.LOF_MUTATION,
        gene_B_pert=SlConstants.SH_RNA,
        cell_line=SlConstants.H4_CELL,
        cellosaurus_id=SlConstants.H4_CELLOSAURUS,
        cancer_type=SlConstants.N_A,
        ncit_id=SlConstants.N_A,
        effect_size=SlConstants.N_A,
        effect_type=SlConstants.N_A,
        assay=SlConstants.CELL_VIABILITY_ASSAY,
        SL=SL,
        pmid=self.pmid,
    )
def create_and_add_sli(self, geneA, geneB, geneApert, geneBpert, assay, pmid,
                       cell=SlConstants.N_A,
                       cellosaurus=SlConstants.N_A,
                       cancer=SlConstants.N_A,
                       ncit=SlConstants.N_A,
                       effecttype=SlConstants.N_A,
                       effectsize=SlConstants.N_A,
                       background_dependency_status=SlConstants.N_A,
                       background_dependency_gene_symbol=SlConstants.N_A,
                       background_dependency_gene_id=SlConstants.N_A,
                       sl=True):
    """Create an interaction and append it to ``self.entries``.

    The ids of both genes are resolved with ``get_ncbigene_curie``; every
    other argument is passed through unchanged to the
    SyntheticLethalInteraction constructor.

    Args:
        geneA: symbol of gene A.
        geneB: symbol of gene B.
        geneApert: perturbation of gene A.
        geneBpert: perturbation of gene B.
        assay: assay description.
        pmid: PubMed id of the source publication.
        cell: cell line.
        cellosaurus: Cellosaurus id of the cell line.
        cancer: cancer type.
        ncit: NCIT id of the cancer type.
        effecttype: effect type.
        effectsize: effect size.
        background_dependency_status: passed through to the constructor.
        background_dependency_gene_symbol: passed through to the constructor.
        background_dependency_gene_id: passed through to the constructor.
        sl: value of the ``SL`` flag (defaults to True).
    """
    gene_a_id = self.get_ncbigene_curie(geneA)
    gene_b_id = self.get_ncbigene_curie(geneB)
    self.entries.append(
        SyntheticLethalInteraction(
            gene_A_symbol=geneA,
            gene_A_id=gene_a_id,
            gene_B_symbol=geneB,
            gene_B_id=gene_b_id,
            gene_A_pert=geneApert,
            gene_B_pert=geneBpert,
            effect_type=effecttype,
            effect_size=effectsize,
            cell_line=cell,
            cellosaurus_id=cellosaurus,
            cancer_type=cancer,
            ncit_id=ncit,
            assay=assay,
            background_dependency_status=background_dependency_status,
            background_dependency_gene_symbol=background_dependency_gene_symbol,
            background_dependency_gene_id=background_dependency_gene_id,
            pmid=pmid,
            SL=sl,
        )
    )
def parse(self):
    """Build the curated ATR-inhibitor co-essential gene interactions.

    The genes are the overlapping identified ATRi co-essential genes
    (significant in all three screens); RNASEH2 was validated in detail.
    Note that C17orf53 was replaced by HROB (gene id: 78995) and MGEA5 by
    OGA (gene id: 10724).

    Returns:
        list: one SyntheticLethalInteraction per gene, ATR being gene A.

    Raises:
        ValueError: if one of the gene symbols is not a key of
            ``self.entrez_dict``. The message is a single formatted string
            that names the offending symbol.
    """
    sig_genes = {
        'RNASEH2B', 'RNASEH2A', "DSCC1", "TMEM208", "POLE3", "POLE4", "LEO1",
        "CNOT1", "SETD1A", "HROB", "OGA", "MCM9", "USP37", "THRAP3", "DPYS",
        "CKS2", "RHNO1", "HUS1"
    }
    sli_list = []
    atr = 'ATR'
    atr_id = self.get_ncbigene_curie(atr)
    for geneB in sig_genes:
        if geneB not in self.entrez_dict:
            # Format the symbol into the message; passing it as a second
            # argument to ValueError would render the message as a tuple.
            raise ValueError("Could not find id for %s" % geneB)
        geneb_id = self.get_ncbigene_curie(geneB)
        sli = SyntheticLethalInteraction(
            gene_A_symbol=atr,
            gene_A_id=atr_id,
            gene_B_symbol=geneB,
            gene_B_id=geneb_id,
            gene_A_pert=SlConstants.PHARMACEUTICAL,
            gene_B_pert=SlConstants.CRISPR_CAS9,
            effect_type=SlConstants.N_A,
            effect_size=SlConstants.N_A,
            cell_line=SlConstants.N_A,
            cellosaurus_id=SlConstants.N_A,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay=SlConstants.CRISPR_CAS9_INTERFERENCE_ASSAY,
            pmid=self.pmid,
            SL=True)
        sli_list.append(sli)
    return sli_list
def _add_negatives(self):
    """Append the negative (non-SL) TOP1 interactions to ``self.sli_list``.

    Reads the tab-separated file ``self.fname`` with the header
    ``Symbol  Gene_ID  Rank  RSA.p-value  FDR``. Rows with an RSA p-value
    below 0.5 are ignored; every remaining row whose symbol can be resolved
    is stored with ``SL=False`` and the p-value as effect size.

    Raises:
        ValueError: if a row does not have exactly 5 fields.
    """
    top1 = 'TOP1'
    top1_id = self.get_ncbigene_curie(top1)
    with open(self.fname) as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter='\t')
        for row in csvreader:
            if len(row) != 5:
                # the message states the number of fields that is checked
                raise ValueError(
                    "Bad line with %d instead of 5 fields: %s" % (len(row), row))
            geneB = row['Symbol']
            pval = float(row['RSA.p-value'])
            if pval < 0.5:
                continue
            sym = self.get_current_symbol(geneB)
            if sym not in self.entrez_dict:
                # We skip symbols that cannot be identified for this
                # negative list
                continue
            geneB_id = self.get_ncbigene_curie(sym)
            if top1_id == geneB_id:
                # There is one self-loop in the data, we discard it because
                # self-loops cannot be SLIs
                continue
            sli = SyntheticLethalInteraction(
                gene_A_symbol=top1,
                gene_A_id=top1_id,
                gene_B_symbol=sym,
                gene_B_id=geneB_id,
                gene_A_pert=SlConstants.PHARMACEUTICAL,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.PVAL,
                effect_size=pval,
                cell_line=SlConstants.MDAMB231_CELL,
                cellosaurus_id=SlConstants.MDAMB231_CELLOSAURUS,
                cancer_type=SlConstants.N_A,
                ncit_id=SlConstants.N_A,
                assay=SlConstants.CELL_VIABILITY_ASSAY,
                pmid=self.pmid,
                SL=False)
            self.sli_list.append(sli)
def parseLoF(self, geneA, fname):
    """Append loss-of-function interactions of ``geneA`` to ``self.sli_list``.

    The original note lists BLM, MUS81, PTEN and PTTG1 -- presumably the
    query genes this method is called with (verify against the callers).

    Only rows whose ``Expression`` column equals ``'Expressed'`` are
    considered; a row yields an interaction when its
    ``80% Confidence Interval (P<0.2)`` column equals 1.

    Args:
        geneA: symbol of the query gene (recorded with a LoF mutation).
        fname: path of the tab-separated table to read.

    Raises:
        ValueError: if a gene symbol is neither a key of
            ``self.entrez_dict`` nor in ``self.unclear_gene_symbols``.
    """
    query_id = self.get_ncbigene_curie(geneA)
    with open(fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if record['Expression'] != 'Expressed':
                continue
            partner = self.get_current_symbol(record['human gene'])
            if partner not in self.entrez_dict:
                if partner in self.unclear_gene_symbols:
                    continue
                raise ValueError(
                    "Could not find iid for %s in Brough 2018 2008 " % partner)
            partner_id = self.get_ncbigene_curie(partner)
            # 0/1 flag column; only flagged rows become interactions
            if int(record['80% Confidence Interval (P<0.2)']) != 1:
                continue
            self.sli_list.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=geneA,
                    gene_A_id=query_id,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.LOF_MUTATION,
                    gene_B_pert=SlConstants.SI_RNA,
                    effect_type='confidence.80%',
                    effect_size='true',
                    cell_line=SlConstants.HCT_116,
                    cellosaurus_id=SlConstants.HCT_116_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.MULTICOLOR_COMPETITION_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
def get_sli(self, geneA_sym, geneA_id, geneB_sym, geneB_id):
    """Create one interaction for an acute myeloid leukemia screen.

    Gene A should be either NRAS or KRAS; these genes had activating
    mutations in the cell lines. Gene B is recorded with an sgRNA
    perturbation.

    Args:
        geneA_sym: symbol of gene A.
        geneA_id: id of gene A.
        geneB_sym: symbol of gene B.
        geneB_id: id of gene B.

    Returns:
        SyntheticLethalInteraction: the newly created interaction (SL=True).
    """
    # NOTE(review): "10090" is the NCBI Taxonomy id of Mus musculus --
    # confirm that this is the intended species.
    return SyntheticLethalInteraction(
        gene_A_symbol=geneA_sym,
        species_id="10090",
        gene_A_id=geneA_id,
        gene_B_symbol=geneB_sym,
        gene_B_id=geneB_id,
        gene_A_pert=SlConstants.ACTIVATING_MUTATION,
        gene_B_pert=SlConstants.SG_RNA,
        effect_type="n/a",
        effect_size="n/a",
        cell_line="n/a",
        cellosaurus_id="n/a",
        cancer_type="Acute Myeloid Leukemia",
        ncit_id="NCIT:C3171",
        assay=SlConstants.CRISPR_CAS9_INTERFERENCE_ASSAY,
        pmid=self.pmid,
        SL=True,
    )
def get_sli(self, geneA_sym, geneA_id, geneB_sym, geneB_id):
    """Create one interaction in which both genes are perturbed by siRNA.

    Cell line, cancer type and effect fields are all set to
    ``SlConstants.N_A``.

    Args:
        geneA_sym: symbol of gene A.
        geneA_id: id of gene A.
        geneB_sym: symbol of gene B.
        geneB_id: id of gene B.

    Returns:
        SyntheticLethalInteraction: the newly created interaction (SL=True).
    """
    not_available = SlConstants.N_A
    # NOTE(review): "10090" is the NCBI Taxonomy id of Mus musculus --
    # confirm that this is the intended species.
    return SyntheticLethalInteraction(
        gene_A_symbol=geneA_sym,
        species_id="10090",
        gene_A_id=geneA_id,
        gene_B_symbol=geneB_sym,
        gene_B_id=geneB_id,
        gene_A_pert=SlConstants.SI_RNA,
        gene_B_pert=SlConstants.SI_RNA,
        effect_type=SlConstants.N_A,
        effect_size=SlConstants.N_A,
        cell_line=SlConstants.N_A,
        cellosaurus_id=SlConstants.N_A,
        cancer_type=not_available,
        ncit_id=not_available,
        assay=SlConstants.MULTICOLOR_COMPETITION_ASSAY,
        pmid=self.pmid,
        SL=True,
    )
def get_sli(self, geneB_sym, pmid):
    """Create one BRAF / ``geneB_sym`` interaction.

    BRAF is recorded with an activating mutation, the partner gene with
    shRNA, using the A375 / melanoma constants.

    Args:
        geneB_sym: symbol of the partner gene; its id is resolved with
            ``get_ncbigene_curie``.
        pmid: PubMed id stored with the interaction.

    Returns:
        SyntheticLethalInteraction: the newly created interaction (SL=True).
    """
    braf_symbol = 'BRAF'
    braf_id = SlConstants.BRAF_GENE_ID
    partner_id = self.get_ncbigene_curie(geneB_sym)
    # NOTE(review): "10090" is the NCBI Taxonomy id of Mus musculus --
    # confirm that this is the intended species.
    return SyntheticLethalInteraction(
        gene_A_symbol=braf_symbol,
        species_id="10090",
        gene_A_id=braf_id,
        gene_B_symbol=geneB_sym,
        gene_B_id=partner_id,
        gene_A_pert=SlConstants.ACTIVATING_MUTATION,
        gene_B_pert=SlConstants.SH_RNA,
        effect_type=SlConstants.N_A,
        effect_size=SlConstants.N_A,
        cell_line=SlConstants.A375_CELL,
        cellosaurus_id=SlConstants.A375_CELLOSAURUS,
        cancer_type=SlConstants.MELANOMA,
        ncit_id=SlConstants.MELANOMA_NCIT,
        assay=SlConstants.CELL_VIABILITY_ASSAY,
        pmid=pmid,
        SL=True,
    )
def get_sli(self, geneB_sym, geneB_id, pval, slstatus):
    """Create one VHL / ``geneB_sym`` interaction.

    VHL is recorded with a loss-of-function mutation, the partner gene with
    shRNA; the p-value is stored as effect size.

    Args:
        geneB_sym: symbol of the partner gene.
        geneB_id: id of the partner gene.
        pval: p-value stored as the effect size.
        slstatus: value passed through as the ``SL`` flag.

    Returns:
        SyntheticLethalInteraction: the newly created interaction.
    """
    vhl_symbol = 'VHL'
    vhl_id = SlConstants.VHL_GENE_ID
    # NOTE(review): "10090" is the NCBI Taxonomy id of Mus musculus --
    # confirm that this is the intended species.
    return SyntheticLethalInteraction(
        gene_A_symbol=vhl_symbol,
        species_id="10090",
        gene_A_id=vhl_id,
        gene_B_symbol=geneB_sym,
        gene_B_id=geneB_id,
        gene_A_pert=SlConstants.LOF_MUTATION,
        gene_B_pert=SlConstants.SH_RNA,
        effect_type=SlConstants.PVAL,
        effect_size=pval,
        cell_line=SlConstants.A498_CELL,
        cellosaurus_id=SlConstants.A498_CELLOSAURUS,
        cancer_type=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA,
        ncit_id=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA_NCIT,
        assay=SlConstants.SG_RNA_DEPLETION_ASSAY,
        pmid=self.pmid,
        SL=slstatus,
    )
def parse(self):
    """Parse ATR interactions from the tab-separated file ``self.fname``.

    Columns of the file: ``symbol``, ``MCF12A.Z-score`` and
    ``HCC1143.Z-score``. The effect size of an interaction is the mean of
    the two Z-scores. The symbol C9ORF96 is replaced by STKLD1, and the row
    for ATR itself is skipped.

    Returns:
        list: one SyntheticLethalInteraction per row, except the ATR row.
    """
    atr = 'ATR'
    atr_id = self.get_ncbigene_curie(atr)
    interactions = []
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            partner = self.get_current_symbol(record['symbol'])
            if partner == 'C9ORF96':
                partner = 'STKLD1'
            partner_id = self.get_ncbigene_curie(partner)
            z_mcf12a = float(record['MCF12A.Z-score'])
            z_hcc1143 = float(record['HCC1143.Z-score'])
            mean_z = 0.5 * (z_mcf12a + z_hcc1143)
            if partner == atr:
                # There is one self-loop in the dataset; it is skipped
                # because it cannot be an SLI.
                continue
            interactions.append(
                SyntheticLethalInteraction(
                    gene_A_symbol=atr,
                    gene_A_id=atr_id,
                    gene_B_symbol=partner,
                    gene_B_id=partner_id,
                    gene_A_pert=SlConstants.PHARMACEUTICAL,
                    gene_B_pert=SlConstants.SI_RNA,
                    effect_type=SlConstants.ZSCORE,
                    effect_size=mean_z,
                    cell_line=SlConstants.HCC1143_CELL,
                    cellosaurus_id=SlConstants.HCC1143_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=SlConstants.CELL_VIABILITY_ASSAY,
                    pmid=self.pmid,
                    SL=True,
                )
            )
    return interactions