def parse(self):
    """Parse the Srivas 2016 table into synthetic lethal interactions.

    Every row of the tab-separated file ``self.fname`` provides a
    comma-separated list of gene A symbols (column ``geneAlist``), one
    gene B symbol (column ``geneB``) and an ``effect`` value whose decimal
    separator may be a comma.  One SyntheticLethalInteraction is created
    for each gene A / gene B combination of a row.

    Returns:
        Whatever ``self._mark_maximum_entries`` returns for the
        interactions grouped by gene pair (presumably the list of SLIs
        with the largest effect size per pair flagged -- confirm against
        the helper).

    Raises:
        ValueError: if a row has fewer than four fields, or if a gene A
            or gene B symbol has no entry in ``self.entrez_dict``.
    """
    gene1_perturbation = SlConstants.PHARMACEUTICAL
    gene2_perturbation = 'natural (is a TSG)'
    assay = "pharmaceutical + siRNA"
    # Collects all SLIs observed for any given gene A/gene B pair
    sli_dict = defaultdict(list)
    with open(self.fname) as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter='\t')
        for row in csvreader:
            if len(row) < 4:
                raise ValueError(
                    "Only got %d fields but was expecting at least 4 tab-separated fields" % len(row))
            # Separate the column that contains multiple genes
            geneA_symbols = row['geneAlist'].split(",")
            geneB_sym = self.get_current_symbol(row['geneB'])
            # The symbol must be looked up in the Entrez map (not in the
            # symbol string itself), otherwise unknown genes would end up
            # with the id "NCBIGene:None".
            if geneB_sym not in self.entrez_dict:
                raise ValueError(
                    "Could not find id for geneB %s in Srivasa 2016" % geneB_sym)
            geneB_id = "NCBIGene:{}".format(self.entrez_dict.get(geneB_sym))
            effect = float(row['effect'].replace(",", "."))
            for geneA_sym in geneA_symbols:
                geneA_sym = self.get_current_symbol(geneA_sym)
                if geneA_sym not in self.entrez_dict:
                    raise ValueError(
                        "Could not find id for geneA %s in Srivasa 2016" % geneA_sym)
                geneA_id = "NCBIGene:{}".format(self.entrez_dict.get(geneA_sym))
                if geneA_id == geneB_id:
                    # There are a few self loops in the data, but these
                    # are not SLIs, so we skip them
                    continue
                sli = SyntheticLethalInteraction(
                    gene_A_symbol=geneA_sym,
                    gene_A_id=geneA_id,
                    gene_B_symbol=geneB_sym,
                    gene_B_id=geneB_id,
                    gene_A_pert=gene1_perturbation,
                    gene_B_pert=gene2_perturbation,
                    effect_type=SlConstants.ZSCORE,
                    effect_size=effect,
                    cell_line=SlConstants.HELA_CELL,
                    cellosaurus_id=SlConstants.HELA_CELLOSAURUS,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=assay,
                    pmid=self.pmid,
                    SL=True)
                gene_pair = GenePair(geneA_sym, geneB_sym)
                sli_dict[gene_pair].append(sli)
    sli_list = self._mark_maximum_entries(sli_dict)
    return sli_list
def parse(self):
    """Parse the Kessler 2012 screen, in which gene A is always MYC.

    Reads the tab-separated file ``self.fname`` (columns ``symbol`` and
    ``median.pair.diffs`` are used) and creates one
    SyntheticLethalInteraction per row whose symbol can be mapped through
    ``self.entrez_dict``.  Symbols listed as unclear are skipped.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row does not have exactly three fields, or if a
            symbol is neither mappable nor in the list of unclear symbols.
    """
    gene_a = 'MYC'
    gene_a_curie = self.get_ncbigene_curie(gene_a)
    pert_a = SlConstants.OVEREXPRESSION
    pert_b = SlConstants.SI_RNA
    assay_name = SlConstants.RNA_INTERFERENCE_ASSAY
    effect_kind = SlConstants.LOG2_DECREASE_IN_ABUNDANCE
    cells = 'human mammary epithelial cells'
    cellosaurus_acc = SlConstants.N_A
    cancer_name = SlConstants.N_A
    ncit_acc = SlConstants.N_A
    by_pair = defaultdict(list)
    # Pseudogenes and divergent non-coding transcripts; DIP (and GIF) map
    # to more than one newer symbol.
    skipped_symbols = {
        'ATP5EP1', 'C10orf111', 'C19ORF30', 'C3ORF51', 'CG030',
        'CLEC4GP1', 'CSN1S2A', 'DIP', 'DKFZP434I0714', 'DVL1L1',
        'FLJ20674', 'FLJ22447', 'GIF', 'HCG27', 'HMG14P', 'IGLV@',
        'LDHBP', 'OR5D2P', 'RBMXP1', 'RPL19P1'
    }
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) != 3:
                raise ValueError("Bad row with %d fields: %s" % (len(record), record))
            symbol_b = self.get_current_symbol(record['symbol'])
            if symbol_b not in self.entrez_dict:
                if symbol_b in skipped_symbols:
                    continue
                raise ValueError(
                    "Could not find id for %s in Kessler 2012 " % symbol_b)
            curie_b = self.get_ncbigene_curie(symbol_b)
            median_diff = float(record['median.pair.diffs'])
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=median_diff,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=True)
            by_pair[GenePair(gene_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse the Shen 2015 screen, in which gene A is always CHEK1.

    Reads the tab-separated file ``self.fname`` (columns ``Symbol`` and
    ``Z-Score`` are used; the z-score may use a decimal comma).  Rows for
    CHEK1 itself are skipped.  Only the symbols in the hard-coded hit list
    are marked as synthetic lethal; all other rows are recorded with
    ``SL=False``.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row has fewer than three fields or a symbol is
            not present in ``self.entrez_dict``.
    """
    chek1 = 'CHEK1'
    chek1_curie = 'NCBIGene:1111'
    pert_a = SlConstants.PHARMACEUTICAL
    pert_b = SlConstants.SI_RNA
    assay_name = SlConstants.RNA_INTERFERENCE_ASSAY
    effect_kind = SlConstants.ZSCORE
    cells = "HeLa-Cells"
    cellosaurus_acc = "CVCL_0030"
    cancer_name = ""
    ncit_acc = ""
    # Genes treated as synthetic lethal hits in this data set
    hits = ["FZR1", "RAD17", "RFC1", "BLM", "CDC73", "CDC6", "WEE1"]
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        # Columns: Z-Score, Symbol, Entrez ID, Gene Name
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 3:
                raise ValueError("Only got %d fields but was expecting at least 3" % len(record))
            symbol_b = self.get_current_symbol(record['Symbol'])
            if symbol_b == 'CHEK1':
                # Do not allow self-loops!
                continue
            if symbol_b not in self.entrez_dict:
                raise ValueError("Could not find id for gene symbol %s in Shen 2015" % symbol_b)
            curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
            zscore = float(record['Z-Score'].replace(",", "."))
            is_sl = symbol_b in hits
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=chek1,
                gene_A_id=chek1_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=zscore,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=is_sl)
            by_pair[GenePair(chek1, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse the Blomen 2015 table of knockout interactions.

    Each row of the tab-separated file ``self.fname`` names a gene
    (column ``GENE``) and a semicolon-separated list of partners (column
    ``INTERACTING QUERY GENE``).  One SyntheticLethalInteraction with
    ``effect_size=0`` and ``SL=True`` is created per gene/partner pair.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if either gene symbol is missing from
            ``self.entrez_dict``.
    """
    knockout = SlConstants.KNOCKOUT
    cellosaurus_acc = SlConstants.HAP1_CELLOSAURUS
    assay_name = 'proportions.of.sense.and.antisense.insertions'
    by_pair = defaultdict(list)
    # Columns: GENE, SUMMARY, PUBMED ID, INTERACTING QUERY GENE
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            symbol_a = self.get_current_symbol(record['GENE'])
            if symbol_a not in self.entrez_dict:
                raise ValueError(
                    "[ERROR] We could not find a gene id for " + symbol_a)
            curie_a = "NCBIGene:{}".format(self.entrez_dict.get(symbol_a))
            partners = record['INTERACTING QUERY GENE'].split(';')
            for raw_partner in partners:
                symbol_b = self.get_current_symbol(raw_partner.strip())
                if symbol_b not in self.entrez_dict:
                    raise ValueError(
                        "Could not get NCBI id for gene \"%s\" in Blomen 2015" % symbol_b)
                curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
                interaction = SyntheticLethalInteraction(
                    gene_A_symbol=symbol_a,
                    gene_A_id=curie_a,
                    gene_B_symbol=symbol_b,
                    gene_B_id=curie_b,
                    gene_A_pert=knockout,
                    gene_B_pert=knockout,
                    effect_type=SlConstants.N_A,
                    effect_size=0,
                    cell_line=SlConstants.HAP1_CELL,
                    cellosaurus_id=cellosaurus_acc,
                    cancer_type=SlConstants.N_A,
                    ncit_id=SlConstants.N_A,
                    assay=assay_name,
                    pmid=self.pmid,
                    SL=True)
                by_pair[GenePair(symbol_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse_suppl10_11(self, fname):
    """Add RB1 interactions from a supplemental table to ``self.sli_dict``.

    Reads the tab-separated file ``fname`` (columns ``target`` and
    ``Penetrance.(%)`` are used).  Rows whose penetrance is at least 80
    are appended to ``self.sli_dict`` as synthetic lethal interactions
    with RB1.  Nothing is returned.

    Args:
        fname: path of the tab-separated supplemental table.

    Raises:
        ValueError: if a symbol is neither in ``self.entrez_dict`` nor in
            ``self.unclear_gene_symbols``, or if the penetrance is not an
            integer.
    """
    gene_a = 'RB1'
    gene_a_curie = SlConstants.RB1_GENE_ID
    pert_a = SlConstants.LOF_MUTATION
    pert_b = SlConstants.SI_RNA
    assay_name = "siMEM+penetrance"
    effect_kind = "penetrance"
    cells = SlConstants.N_A
    cellosaurus_acc = SlConstants.N_A
    cancer_name = SlConstants.N_A
    ncit_acc = SlConstants.N_A
    with open(fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            symbol_b = self.get_current_symbol(record['target'])
            if ',' in symbol_b:
                # Some entries look like "PMS2,PMS2CL"; the effect cannot
                # be assigned unambiguously to one of the genes.
                continue
            if symbol_b not in self.entrez_dict:
                if symbol_b in self.unclear_gene_symbols:
                    continue
                raise ValueError("Could not find id for %s in Brough 2018 2008 " % symbol_b)
            curie_b = self.get_ncbigene_curie(symbol_b)
            percent = int(record['Penetrance.(%)'])
            if percent < 80:
                # Below the penetrance cut-off: not recorded
                continue
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=percent,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=True)
            self.sli_dict[GenePair(gene_a, symbol_b)].append(interaction)
def parse_suppl9(self):
    """Add RB1 interactions from supplemental table 9 to ``self.sli_dict``.

    Reads the tab-separated file ``data/brough_2012_suppl9.tsv`` (columns
    ``symbol`` and ``Penetrance.%`` are used).  Rows whose penetrance is at
    least 80 are appended to ``self.sli_dict`` as synthetic lethal
    interactions with RB1.  Nothing is returned.

    Raises:
        ValueError: if a symbol is neither in ``self.entrez_dict`` nor in
            ``self.unclear_gene_symbols``, or if the penetrance is not an
            integer.
    """
    table_path = 'data/brough_2012_suppl9.tsv'
    gene_a = 'RB1'
    gene_a_curie = SlConstants.RB1_GENE_ID
    pert_a = SlConstants.LOF_MUTATION
    pert_b = SlConstants.SI_RNA
    assay_name = "siMEM+penetrance"
    effect_kind = "penetrance"
    cells = SlConstants.N_A
    cellosaurus_acc = SlConstants.N_A
    cancer_name = SlConstants.N_A
    ncit_acc = SlConstants.N_A
    with open(table_path) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            symbol_b = self.get_current_symbol(record['symbol'])
            if symbol_b not in self.entrez_dict:
                if symbol_b in self.unclear_gene_symbols:
                    continue
                raise ValueError("Could not find iid for %s in Brough 2018 2008 " % symbol_b)
            curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
            percent = int(record['Penetrance.%'])
            if percent < 80:
                # Below the penetrance cut-off: not recorded
                continue
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=percent,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=True)
            self.sli_dict[GenePair(gene_a, symbol_b)].append(interaction)
def parse(self):
    """Parse the Turner 2008 screen, in which gene A is always PARP1.

    Reads the tab-separated file ``self.fname`` (columns ``SMARTpool`` and
    ``Z score`` are used).  A row is marked synthetic lethal when its
    z-score is at most -3.0; all other mapped rows are recorded with
    ``SL=False``.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row has fewer than three fields, or if an
            unmappable symbol has a z-score above 3.0.
    """
    gene_a = 'PARP1'
    gene_a_curie = 'NCBIGene:142'
    pert_a = SlConstants.PHARMACEUTICAL
    pert_b = SlConstants.SI_RNA
    assay_name = ";".join(['competitive hybridization', 'multicolor competition assay'])
    effect_kind = 'stddev'
    cells = 'CAL-51'
    cellosaurus_acc = 'CVCL_1110'
    cancer_name = "Breast Carcinoma"
    ncit_acc = "NCIT:C4872"
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        # Columns: SMARTpool, Z score, percent-siCONTROL
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 3:
                raise ValueError(
                    "Bad row for Turner et al, only %d fields found (%s)" % (len(record), record))
            symbol_b = self.get_current_symbol(record['SMARTpool'])
            score = float(record['Z score'])
            if symbol_b in self.entrez_dict:
                curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
            elif symbol_b == 'IMPK':
                # Could not be found in HGNC or NCBI Gene
                continue
            elif symbol_b == 'FLJ34389':
                symbol_b = 'MLKL'
                curie_b = "NCBIGene:197259"
            elif score > 3.0:
                # NOTE(review): SL is defined below as score <= -3.0, yet
                # unmapped genes only raise for score > 3.0 -- confirm that
                # this threshold is the intended one.
                raise ValueError(
                    "Could not get NCBI id for gene %s in Turner 2008" % symbol_b)
            else:
                # Treated as negative examples; unmapped ones are skipped
                continue
            is_sl = score <= -3.0
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=score,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=is_sl)
            by_pair[GenePair(gene_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse the Luo 2009 shRNA screen, in which gene A is always KRAS.

    Reads the tab-separated file ``self.fname`` (columns ``Symbol`` and
    ``SD.DLD1`` are used).  Every mapped row is recorded with ``SL=True``
    and the ``SD.DLD1`` value as effect size.  A few symbols are mapped by
    hand and two unmappable ``LOC`` symbols are skipped.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row has fewer than eight fields or a symbol
            cannot be mapped to an NCBI Gene id.
    """
    gene_a = 'KRAS'
    gene_a_curie = 'NCBIGene:3845'
    pert_a = SlConstants.ACTIVATING_MUTATION
    pert_b = 'shRNA'
    assay_name = ";".join(['competitive hybridization', 'multicolor competition assay'])
    effect_kind = 'stddev'
    cells = "DLD-1"
    cellosaurus_acc = "CVCL_0248"
    cancer_name = "Colorectal Carcinoma"
    ncit_acc = "NCIT:C2955"
    # Hand-curated (symbol, id) pairs for symbols missing from entrez_dict
    manual_mapping = {
        # This is a LINC, plus the symbol is old
        'FLJ34747': ('LINC00547', 'NCBIGene:400121'),
        # an ncRNA
        'LOC283194': ('LOC283194', 'NCBIGene:283194'),
        'LOC285556': ('LOC285556', 'NCBIGene:285556'),
    }
    # Could not find these in NCBI Gene or HGNC
    not_found = ('LOC149654', 'LOC730000')
    # Columns: Symbol, Accession, v2SH, Sequence, Mean.DLD1, SD.DLD1,
    # Mean.HCT116, SD.HCT116
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 8:
                raise ValueError(
                    "Bad line in Luo2009 with less than 8 fields")
            symbol_b = self.get_current_symbol(record['Symbol'])
            if symbol_b == 'CXORF40A':
                symbol_b = 'EOLA1'
            if symbol_b in self.entrez_dict:
                curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
            elif symbol_b in manual_mapping:
                symbol_b, curie_b = manual_mapping[symbol_b]
            elif symbol_b in not_found:
                continue
            else:
                raise ValueError(
                    "Could not get NCBI id for gene %s in Luo2009" % symbol_b)
            sd_dld1 = float(record['SD.DLD1'])
            # All data in this set is True  # TODO CHECK
            is_sl = True
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=sd_dld1,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=cancer_name,
                ncit_id=ncit_acc,
                assay=assay_name,
                pmid=self.pmid,
                SL=is_sl)
            by_pair[GenePair(gene_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse the Steckel 2012 siRNA screen, in which gene A is always KRAS.

    Reads the tab-separated file ``self.fname`` (columns ``GeneID``,
    ``HCT-116.Z-score``, ``HKE-3.Z-score`` and ``D.Z-score`` are used).
    A row is marked synthetic lethal when the delta z-score is at least
    3.3 and the HKE-3 z-score is below 2; all other mapped rows are
    recorded with ``SL=False``.  The HCT-116 z-score is stored as the
    effect size.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row does not have exactly six fields, or if a
            symbol with a delta z-score of at least 2 cannot be mapped to
            an NCBI Gene id and is not listed as unclear.
    """
    kras_symbol = 'KRAS'
    kras_id = SlConstants.KRAS_GENE_ID
    kras_perturbation = SlConstants.ACTIVATING_MUTATION
    gene2_perturbation = SlConstants.SI_RNA
    assay_string = SlConstants.RNA_INTERFERENCE_ASSAY
    effect_type = 'stddev'
    cell_line = SlConstants.HCT_116
    cellosaurus = SlConstants.HCT_116_CELLOSAURUS
    cancer = SlConstants.COLORECTAL_CARCINOMA
    ncit = SlConstants.COLORECTAL_CARCINOMA_NCIT
    sli_dict = defaultdict(list)
    # Immunoglobulin or multiple mapping old symbols
    # COAS3, CES4, POM121L1, MYCL2 are aliases for a pseudogene
    unclear_gene_symbols = {
        'MAD', 'IGHG4', 'DKFZp434C1418', 'COAS3', 'HNT', 'CES4', 'SAS',
        'HLA-DRB3', 'LOC90557', 'POM121L1', 'MLL2', '37499', 'MYCL2',
        'CAMKIINALPHA', 'TGIF', 'PCDHA2', 'PCDHA9'
    }
    # Columns: GeneID, Locus.ID, Accession, HCT-116.Z-score,
    # HKE-3.Z-score, D.Z-score
    with open(self.fname) as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter='\t')
        for row in csvreader:
            if len(row) != 6:
                raise ValueError("Line has %d fields (should have 6): %s" % (len(row), row))
            geneB_sym = self.get_current_symbol(row['GeneID'])
            HCT116_zscore = float(row['HCT-116.Z-score'])
            HKE3_zscore = float(row['HKE-3.Z-score'])
            delta_zscore = float(row['D.Z-score'])
            if geneB_sym == 'C9ORF96':
                # Mapped by hand to STKLD1.  The rename must happen before
                # the id lookup; otherwise no id is assigned for this row
                # and a stale id from the previous row would be reused.
                geneB_sym = 'STKLD1'
            if geneB_sym in self.entrez_dict:
                geneB_id = "NCBIGene:{}".format(
                    self.entrez_dict.get(geneB_sym))
            elif geneB_sym in unclear_gene_symbols:
                continue
            elif delta_zscore < 2:
                # One of the many negative samples; we can skip it if it
                # cannot be mapped
                continue
            else:
                raise ValueError(
                    "Could not find id for gene %s in Steckel 2012" % geneB_sym)
            if geneB_sym == "KRAS":
                # This was an internal control!
                continue
            SL = delta_zscore >= 3.3 and HKE3_zscore < 2
            sli = SyntheticLethalInteraction(
                gene_A_symbol=kras_symbol,
                gene_A_id=kras_id,
                gene_B_symbol=geneB_sym,
                gene_B_id=geneB_id,
                gene_A_pert=kras_perturbation,
                gene_B_pert=gene2_perturbation,
                effect_type=effect_type,
                effect_size=HCT116_zscore,
                cell_line=cell_line,
                cellosaurus_id=cellosaurus,
                cancer_type=cancer,
                ncit_id=ncit,
                assay=assay_string,
                pmid=self.pmid,
                SL=SL)
            gene_pair = GenePair(kras_symbol, geneB_sym)
            sli_dict[gene_pair].append(sli)
    sli_list = self._mark_maximum_entries(sli_dict)
    return sli_list
def parse(self):
    """Parse the Bommi 2008 screen, in which gene A is always VHL.

    Reads the tab-separated file ``self.fname`` (columns ``gene``,
    ``differential``, ``cell`` and ``table`` are used).  Every mapped row
    is recorded with ``SL=True``; the cell line is taken from the ``cell``
    column, which must be ``RCC4`` or ``786-0``.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a row has fewer than four fields, a symbol cannot
            be mapped to an NCBI Gene id, or the cell type is unknown.
    """
    # Because of the experiment, gene A is always VHL.
    gene_a = 'VHL'
    gene_a_curie = SlConstants.VHL_GENE_ID
    pert_a = SlConstants.LOF_MUTATION
    pert_b = SlConstants.SH_RNA
    # Not used below; kept so that the method behaves exactly as before.
    assays = [SlConstants.COMPETITIVE_HYBRIDIZATION, SlConstants.MULTICOLOR_COMPETITION_ASSAY]
    effect_kind = 'differential_viability'
    # Value of the 'cell' column -> (cell line name, Cellosaurus accession)
    known_cells = {
        'RCC4': ("RCC4", "CVCL_0498"),
        '786-0': ("786-0", "CVCL_1051"),
    }
    by_pair = defaultdict(list)
    # Symbols that are not current and either could not be matched or
    # match multiple candidates (e.g. one could be CDK11A or CDK11B)
    ambiguous = {'PITSLRE', 'TAK1', 'PKD3', 'CAMLCK', 'MAPAPK3', 'CK1E', 'CK2A2', 'PDGRFB', 'ZC1/HGK'}
    # Columns: gene, differential, cell, table
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            if len(record) < 4:
                raise ValueError("Only got %d fields but was expecting 4" % len(record))
            symbol_b = self.get_current_symbol(record['gene'].upper())
            if symbol_b in ("IRR", "HER4") or symbol_b in ambiguous:
                continue
            if symbol_b not in self.entrez_dict:
                raise ValueError("Could not find id for %s in Bommi 2008" % symbol_b)
            curie_b = "NCBIGene:{}".format(self.entrez_dict.get(symbol_b))
            differential = float(record['differential'])
            cell_name = record['cell']
            if cell_name not in known_cells:
                raise ValueError("Did not recognize cell type '%s'" % cell_name)
            cells, cellosaurus_acc = known_cells[cell_name]
            assay_name = "differential viability assay {}({})".format(cell_name, record['table'])
            # All data in this set is True  # TODO CHECK
            is_sl = True
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=pert_a,
                gene_B_pert=pert_b,
                effect_type=effect_kind,
                effect_size=differential,
                cell_line=cells,
                cellosaurus_id=cellosaurus_acc,
                cancer_type=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA,
                ncit_id=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA_NCIT,
                assay=assay_name,
                pmid=self.pmid,
                SL=is_sl)
            by_pair[GenePair(gene_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)
def parse(self):
    """Parse the Mohni 2014 screen, in which gene A is always ATR.

    Reads the tab-separated file ``self.fname`` (columns ``Gene Symbol``,
    ``Mock.1``..``Mock.4`` and ``ATRi.1``..``ATRi.4`` are used).  For each
    row the mean of the four ``ATRi - Mock`` differences is computed and
    stored as the effect size.  A mean below -2 is marked synthetic
    lethal, a mean of 0 or more is recorded with ``SL=False``.

    Returns:
        The result of ``self._mark_maximum_entries`` for the collected
        interactions grouped by gene pair.

    Raises:
        ValueError: if a symbol cannot be mapped to an NCBI Gene id, or if
            the mean difference is neither below -2 nor at least 0.
    """
    gene_a = 'ATR'
    gene_a_curie = 'NCBIGene:545'
    # A few special cases -- capitalization is not correct in the HGNC file
    renamed = {
        'C10ORF119': 'MCMBP',
        'C15ORF20': 'PIF1',
        'CXORF53': 'BRCC3',
    }
    by_pair = defaultdict(list)
    with open(self.fname) as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            symbol_b = self.get_current_symbol(record['Gene Symbol'])
            if symbol_b == 'ATR':
                # Self interaction, not a SLI!
                continue
            symbol_b = renamed.get(symbol_b, symbol_b)
            if symbol_b not in self.entrez_dict:
                raise ValueError("Could not find id for gene %s in Mohni 2014" % symbol_b)
            curie_b = self.get_ncbigene_curie(symbol_b)
            # Read all eight measurements first (mock before ATRi for each
            # replicate), then form the per-replicate differences.
            measurements = []
            for replicate in (1, 2, 3, 4):
                mock = float(record['Mock.%d' % replicate])
                atri = float(record['ATRi.%d' % replicate])
                measurements.append((mock, atri))
            differences = [atri - mock for mock, atri in measurements]
            # The SL call is based on the mean difference over the four
            # replicates.
            mean_diff = np.array(differences).mean()
            if mean_diff < -2:
                is_sl = True
            elif mean_diff >= 0:
                is_sl = False
            else:
                raise ValueError("Expecting mean either below -2 or above 0")
            interaction = SyntheticLethalInteraction(
                gene_A_symbol=gene_a,
                gene_A_id=gene_a_curie,
                gene_B_symbol=symbol_b,
                gene_B_id=curie_b,
                gene_A_pert=SlConstants.PHARMACEUTICAL,
                gene_B_pert=SlConstants.SI_RNA,
                effect_type=SlConstants.ZSCORE,
                effect_size=mean_diff,
                cell_line=SlConstants.U2OS_CELL,
                cellosaurus_id=SlConstants.U2OS_CELLOSAURUS,
                cancer_type='n/a',
                ncit_id='n/a',
                assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                pmid=self.pmid,
                SL=is_sl)
            by_pair[GenePair(gene_a, symbol_b)].append(interaction)
    return self._mark_maximum_entries(by_pair)