예제 #1
0
 def parse(self):
     """Parse the tab-separated file ``self.fname`` into synthetic lethal interactions.

     Each row must provide the columns ``geneAlist`` (comma-separated gene A
     symbols), ``geneB`` and ``effect`` (a number that may use a decimal
     comma). One interaction is created per gene A / gene B combination of a
     row; combinations whose two genes resolve to the same id are skipped.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: if a row has fewer than 4 fields, or if a gene symbol
             has no entry in ``self.entrez_dict``.
     """
     gene1_perturbation = SlConstants.PHARMACEUTICAL
     gene2_perturbation = 'natural (is a TSG)'
     assay = "pharmaceutical + siRNA"
     # All interactions seen so far, grouped by gene A/gene B pair.
     sli_dict = defaultdict(list)
     with open(self.fname) as csvfile:
         csvreader = csv.DictReader(csvfile, delimiter='\t')
         for row in csvreader:
             if len(row) < 4:
                 raise ValueError(
                     "Only got %d fields but was expecting at least 4 tab-separated fields"
                     % len(row))
             # The gene A column may hold several comma-separated symbols.
             geneA_symbols = row['geneAlist'].split(",")
             geneB_sym = self.get_current_symbol(row['geneB'])
             # Check the Entrez map itself. Testing the symbol against its
             # own current symbol can never fail and would let an unmapped
             # gene through as "NCBIGene:None".
             if geneB_sym not in self.entrez_dict:
                 raise ValueError(
                     "Could not find id for geneB %s in Srivasa 2016" %
                     geneB_sym)
             geneB_id = "NCBIGene:{}".format(self.entrez_dict.get(geneB_sym))
             effect = float(row['effect'].replace(",", "."))
             for symbol in geneA_symbols:
                 geneA_sym = self.get_current_symbol(symbol)
                 if geneA_sym not in self.entrez_dict:
                     raise ValueError(
                         "Could not find id for geneA %s in Srivasa 2016" %
                         geneA_sym)
                 geneA_id = "NCBIGene:{}".format(
                     self.entrez_dict.get(geneA_sym))
                 if geneA_id == geneB_id:
                     # There are a few self loops in the data, but these are
                     # not SLIs, so we skip them.
                     continue
                 sli = SyntheticLethalInteraction(
                     gene_A_symbol=geneA_sym,
                     gene_A_id=geneA_id,
                     gene_B_symbol=geneB_sym,
                     gene_B_id=geneB_id,
                     gene_A_pert=gene1_perturbation,
                     gene_B_pert=gene2_perturbation,
                     effect_type=SlConstants.ZSCORE,
                     effect_size=effect,
                     cell_line=SlConstants.HELA_CELL,
                     cellosaurus_id=SlConstants.HELA_CELLOSAURUS,
                     cancer_type=SlConstants.N_A,
                     ncit_id=SlConstants.N_A,
                     assay=assay,
                     pmid=self.pmid,
                     SL=True)
                 sli_dict[GenePair(geneA_sym, geneB_sym)].append(sli)
     return self._mark_maximum_entries(sli_dict)
 def parse(self):
     """Build MYC-centred synthetic lethal interactions from ``self.fname``.

     The tab-separated input must have exactly three fields per row, among
     them ``symbol`` (gene B) and ``median.pair.diffs`` (used as the effect
     size). A symbol missing from ``self.entrez_dict`` is skipped when it is
     one of the unclear symbols listed below and is an error otherwise.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: on a row without exactly three fields or on a symbol
             that can be neither mapped nor skipped.
     """
     gene_a = 'MYC'
     gene_a_curie = self.get_ncbigene_curie(gene_a)
     # Keyword arguments that are the same for every interaction of this screen.
     shared = dict(
         gene_A_symbol=gene_a,
         gene_A_id=gene_a_curie,
         gene_A_pert=SlConstants.OVEREXPRESSION,
         gene_B_pert=SlConstants.SI_RNA,
         effect_type=SlConstants.LOG2_DECREASE_IN_ABUNDANCE,
         cell_line='human mammary epithelial cells',
         cellosaurus_id=SlConstants.N_A,
         cancer_type=SlConstants.N_A,
         ncit_id=SlConstants.N_A,
         assay=SlConstants.RNA_INTERFERENCE_ASSAY,
         SL=True,
     )
     # Pseudogenes, divergent nc transcripts.
     # DIP maps to two newer symbols (as does GIF).
     skipped_symbols = {
         'ATP5EP1', 'C10orf111', 'C19ORF30', 'C3ORF51', 'CG030', 'CLEC4GP1',
         'CSN1S2A', 'DIP', 'DKFZP434I0714', 'DVL1L1', 'FLJ20674',
         'FLJ22447', 'GIF', 'HCG27', 'HMG14P', 'IGLV@', 'LDHBP', 'OR5D2P',
         'RBMXP1', 'RPL19P1'
     }
     interactions = defaultdict(list)
     with open(self.fname) as handle:
         for record in csv.DictReader(handle, delimiter='\t'):
             n_fields = len(record)
             if n_fields != 3:
                 raise ValueError("Bad row with %d fields: %s" %
                                  (n_fields, record))
             gene_b = self.get_current_symbol(record['symbol'])
             if gene_b not in self.entrez_dict:
                 if gene_b in skipped_symbols:
                     continue
                 raise ValueError(
                     "Could not find id for %s in Kessler 2012 " % gene_b)
             gene_b_curie = self.get_ncbigene_curie(gene_b)
             median_diff = float(record['median.pair.diffs'])
             interaction = SyntheticLethalInteraction(
                 gene_B_symbol=gene_b,
                 gene_B_id=gene_b_curie,
                 effect_size=median_diff,
                 pmid=self.pmid,
                 **shared)
             interactions[GenePair(gene_a, gene_b)].append(interaction)
     return self._mark_maximum_entries(interactions)
 def parse(self):
     """Build CHEK1-centred interactions from the tab-separated ``self.fname``.

     Rows are read by header name; ``Symbol`` (gene B) and ``Z-Score`` (effect
     size, decimal comma accepted) are used. Rows whose gene B is CHEK1 itself
     are skipped. An interaction is flagged as synthetic lethal only when
     gene B is one of the symbols listed in ``confirmed_hits``.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: if a row has fewer than 3 fields or the gene B symbol is
             missing from ``self.entrez_dict``.
     """
     gene_a = 'CHEK1'
     # Keyword arguments that are the same for every interaction of this screen.
     shared = dict(
         gene_A_symbol=gene_a,
         gene_A_id='NCBIGene:1111',
         gene_A_pert=SlConstants.PHARMACEUTICAL,
         gene_B_pert=SlConstants.SI_RNA,
         effect_type=SlConstants.ZSCORE,
         cell_line="HeLa-Cells",
         cellosaurus_id="CVCL_0030",
         cancer_type="",
         ncit_id="",
         assay=SlConstants.RNA_INTERFERENCE_ASSAY,
     )
     # Gene B symbols that get SL=True; every other gene is recorded with SL=False.
     confirmed_hits = ("FZR1", "RAD17", "RFC1", "BLM", "CDC73", "CDC6", "WEE1")
     interactions = defaultdict(list)
     with open(self.fname) as handle:
         # Columns: Z-Score	Symbol	Entrez ID	Gene Name
         for record in csv.DictReader(handle, delimiter='\t'):
             if len(record) < 3:
                 raise ValueError("Only got %d fields but was expecting at least 3" % len(record))
             gene_b = self.get_current_symbol(record['Symbol'])
             if gene_b == 'CHEK1':
                 # Do not allow self-loops!
                 continue
             if gene_b not in self.entrez_dict:
                 raise ValueError("Could not find id for gene symbol %s in Shen 2015" % gene_b)
             gene_b_curie = "NCBIGene:{}".format(self.entrez_dict.get(gene_b))
             z_score = float(record['Z-Score'].replace(",", "."))
             interaction = SyntheticLethalInteraction(
                 gene_B_symbol=gene_b,
                 gene_B_id=gene_b_curie,
                 effect_size=z_score,
                 pmid=self.pmid,
                 SL=gene_b in confirmed_hits,
                 **shared)
             interactions[GenePair(gene_a, gene_b)].append(interaction)
     return self._mark_maximum_entries(interactions)
 def parse(self):
     """Parse the tab-separated file ``self.fname`` into knockout interactions.

     Expected columns: GENE, SUMMARY, PUBMED ID, INTERACTING QUERY GENE. The
     last column holds one or more gene B symbols separated by ``;``; one
     interaction is created per GENE / gene B combination. No effect size is
     available, so every interaction gets ``effect_size=0`` and
     ``effect_type=SlConstants.N_A``.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair (also for a file without data rows).

     Raises:
         ValueError: if a gene symbol has no entry in ``self.entrez_dict``.
     """
     perturbation = SlConstants.KNOCKOUT
     cellosaurus = SlConstants.HAP1_CELLOSAURUS
     assay = 'proportions.of.sense.and.antisense.insertions'
     sli_dict = defaultdict(list)
     with open(self.fname) as csvfile:
         csvreader = csv.DictReader(csvfile, delimiter='\t')
         for row in csvreader:
             geneA = self.get_current_symbol(row['GENE'])
             if geneA not in self.entrez_dict:
                 raise ValueError(
                     "[ERROR] We could not find a gene id for " + geneA)
             geneA_id = "NCBIGene:{}".format(self.entrez_dict.get(geneA))
             for raw_symbol in row['INTERACTING QUERY GENE'].split(';'):
                 geneB = self.get_current_symbol(raw_symbol.strip())
                 if geneB not in self.entrez_dict:
                     raise ValueError(
                         "Could not get NCBI id for gene \"%s\" in Blomen 2015"
                         % geneB)
                 geneB_id = "NCBIGene:{}".format(self.entrez_dict.get(geneB))
                 sli = SyntheticLethalInteraction(
                     gene_A_symbol=geneA,
                     gene_A_id=geneA_id,
                     gene_B_symbol=geneB,
                     gene_B_id=geneB_id,
                     gene_A_pert=perturbation,
                     gene_B_pert=perturbation,
                     effect_type=SlConstants.N_A,
                     effect_size=0,
                     cell_line=SlConstants.HAP1_CELL,
                     cellosaurus_id=cellosaurus,
                     cancer_type=SlConstants.N_A,
                     ncit_id=SlConstants.N_A,
                     assay=assay,
                     pmid=self.pmid,
                     SL=True)
                 sli_dict[GenePair(geneA, geneB)].append(sli)
     # Reduce once, after all rows are read. Doing this inside the loop
     # repeated the work for every interaction and left the result undefined
     # when the file had no data rows.
     return self._mark_maximum_entries(sli_dict)
예제 #5
0
    def parse_suppl10_11(self, fname):
        """Add RB1 interactions from the tab-separated file *fname* to ``self.sli_dict``.

        Rows are read by header name: ``target`` is gene B and
        ``Penetrance.(%)`` is an integer used as the effect size. Only rows
        with a penetrance of at least 80 are stored. Nothing is returned.

        Raises:
            ValueError: if a gene B symbol is neither in ``self.entrez_dict``
                nor in ``self.unclear_gene_symbols``.
        """
        gene_a = 'RB1'
        # Keyword arguments that are the same for every interaction of this table.
        shared = dict(
            gene_A_symbol=gene_a,
            gene_A_id=SlConstants.RB1_GENE_ID,
            gene_A_pert=SlConstants.LOF_MUTATION,
            gene_B_pert=SlConstants.SI_RNA,
            effect_type="penetrance",
            cell_line=SlConstants.N_A,
            cellosaurus_id=SlConstants.N_A,
            cancer_type=SlConstants.N_A,
            ncit_id=SlConstants.N_A,
            assay="siMEM+penetrance",
            SL=True,
        )

        with open(fname) as handle:
            for record in csv.DictReader(handle, delimiter='\t'):
                gene_b = self.get_current_symbol(record['target'])
                if ',' in gene_b:
                    # Some entries look like "PMS2,PMS2CL"; the effect cannot
                    # be assigned unambiguously to one of the genes.
                    continue
                if gene_b not in self.entrez_dict:
                    if gene_b in self.unclear_gene_symbols:
                        continue
                    raise ValueError("Could not find id for %s in Brough 2018 2008 " % gene_b)
                gene_b_curie = self.get_ncbigene_curie(gene_b)
                penetrance = int(record['Penetrance.(%)'])
                if penetrance < 80:
                    continue
                interaction = SyntheticLethalInteraction(
                    gene_B_symbol=gene_b,
                    gene_B_id=gene_b_curie,
                    effect_size=penetrance,
                    pmid=self.pmid,
                    **shared)
                self.sli_dict[GenePair(gene_a, gene_b)].append(interaction)
예제 #6
0
    def parse_suppl9(self, fname='data/brough_2012_suppl9.tsv'):
        """Add RB1 interactions from a tab-separated table to ``self.sli_dict``.

        Rows are read by header name: ``symbol`` is gene B and
        ``Penetrance.%`` is an integer used as the effect size. Only rows
        with a penetrance of at least 80 are stored. Nothing is returned.

        Args:
            fname: path of the input table. Defaults to the path that used to
                be hard-coded, so existing callers are unaffected.

        Raises:
            ValueError: if a gene B symbol is neither in ``self.entrez_dict``
                nor in ``self.unclear_gene_symbols``.
        """
        rb1 = 'RB1'
        rb1_id = SlConstants.RB1_GENE_ID
        rb1_perturbation = SlConstants.LOF_MUTATION
        gene2_perturbation = SlConstants.SI_RNA
        assay_string = "siMEM+penetrance"
        effect_type = "penetrance"
        cell_line = SlConstants.N_A
        cellosaurus = SlConstants.N_A
        cancer = SlConstants.N_A
        ncit = SlConstants.N_A

        with open(fname) as csvfile:
            csvreader = csv.DictReader(csvfile, delimiter='\t')
            for row in csvreader:
                geneBsym = self.get_current_symbol(row['symbol'])
                if geneBsym in self.entrez_dict:
                    geneB_id = "NCBIGene:{}".format(self.entrez_dict.get(geneBsym))
                elif geneBsym in self.unclear_gene_symbols:
                    continue
                else:
                    raise ValueError("Could not find id for %s in Brough 2018 2008 " % geneBsym)
                penetrance = int(row['Penetrance.%'])
                if penetrance < 80:
                    # Rows below the penetrance cut-off are not recorded.
                    continue
                sli = SyntheticLethalInteraction(gene_A_symbol=rb1,
                                                 gene_A_id=rb1_id,
                                                 gene_B_symbol=geneBsym,
                                                 gene_B_id=geneB_id,
                                                 gene_A_pert=rb1_perturbation,
                                                 gene_B_pert=gene2_perturbation,
                                                 effect_type=effect_type,
                                                 effect_size=penetrance,
                                                 cell_line=cell_line,
                                                 cellosaurus_id=cellosaurus,
                                                 cancer_type=cancer,
                                                 ncit_id=ncit,
                                                 assay=assay_string,
                                                 pmid=self.pmid,
                                                 SL=True)
                gene_pair = GenePair(rb1, geneBsym)
                self.sli_dict[gene_pair].append(sli)
 def parse(self):
     """Parse the tab-separated file ``self.fname`` into PARP1-centred interactions.

     Expected columns: SMARTpool (gene B symbol), Z score, percent-siCONTROL.
     An interaction is flagged as synthetic lethal when its Z score is
     ``<= -3.0``; all other mapped rows are recorded with ``SL=False``.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair (also when no row was kept).

     Raises:
         ValueError: if a row has fewer than 3 fields, or if an unmapped gene
             symbol has a Z score above 3.0.
     """
     parp1_symbol = 'PARP1'
     parp1_id = 'NCBIGene:142'
     parp1_perturbation = SlConstants.PHARMACEUTICAL
     gene2_perturbation = SlConstants.SI_RNA
     assays = ['competitive hybridization', 'multicolor competition assay']
     assay_string = ";".join(assays)
     effect_type = 'stddev'
     cell_line = 'CAL-51'
     cellosaurus = 'CVCL_1110'
     cancer = "Breast Carcinoma"
     ncit = "NCIT:C4872"
     sli_dict = defaultdict(list)
     with open(self.fname) as csvfile:
         csvreader = csv.DictReader(csvfile, delimiter='\t')
         for row in csvreader:
             if len(row) < 3:
                 raise ValueError(
                     "Bad row for Turner et al, only %d fields found (%s)" %
                     (len(row), row))
             geneB_sym = self.get_current_symbol(row['SMARTpool'])
             zscore = float(row['Z score'])
             if geneB_sym in self.entrez_dict:
                 geneB_id = "NCBIGene:{}".format(
                     self.entrez_dict.get(geneB_sym))
             elif geneB_sym == 'IMPK':
                 continue  # could not be found in HGNC or NCBI Gene
             elif geneB_sym == 'FLJ34389':
                 geneB_sym = 'MLKL'
                 geneB_id = "NCBIGene:197259"
             elif zscore > 3.0:
                 # NOTE(review): hits are defined below as zscore <= -3.0, yet
                 # unmapped symbols only raise for zscore > 3.0 -- confirm
                 # that this sign is intended.
                 raise ValueError(
                     "Could not get NCBI id for gene %s in Turner 2008"
                     % geneB_sym)
             else:
                 continue  # These are negative examples, we will just skip
             sli = SyntheticLethalInteraction(
                 gene_A_symbol=parp1_symbol,
                 gene_A_id=parp1_id,
                 gene_B_symbol=geneB_sym,
                 gene_B_id=geneB_id,
                 gene_A_pert=parp1_perturbation,
                 gene_B_pert=gene2_perturbation,
                 effect_type=effect_type,
                 effect_size=zscore,
                 cell_line=cell_line,
                 cellosaurus_id=cellosaurus,
                 cancer_type=cancer,
                 ncit_id=ncit,
                 assay=assay_string,
                 pmid=self.pmid,
                 SL=zscore <= -3.0)
             sli_dict[GenePair(parp1_symbol, geneB_sym)].append(sli)
     # Reduce once, after all rows are read. Doing this inside the loop
     # repeated the work for every row and left the result undefined when no
     # row reached the end of the loop body.
     return self._mark_maximum_entries(sli_dict)
예제 #8
0
    def parse(self):
        """Build KRAS-centred interactions from the tab-separated ``self.fname``.

        Columns: Symbol, Accession, v2SH, Sequence, Mean.DLD1, SD.DLD1,
        Mean.HCT116, SD.HCT116. ``Symbol`` is gene B and ``SD.DLD1`` is used
        as the effect size. A few symbols missing from ``self.entrez_dict``
        are mapped by hand or skipped, as listed below.

        Returns:
            The result of ``self._mark_maximum_entries`` applied to the
            interactions grouped by gene pair.

        Raises:
            ValueError: if a row has fewer than 8 fields or a symbol can be
                neither mapped nor skipped.
        """
        gene_a = 'KRAS'
        # Keyword arguments that are the same for every interaction of this screen.
        shared = dict(
            gene_A_symbol=gene_a,
            gene_A_id='NCBIGene:3845',
            gene_A_pert=SlConstants.ACTIVATING_MUTATION,
            gene_B_pert='shRNA',
            effect_type='stddev',
            cell_line="DLD-1",
            cellosaurus_id="CVCL_0248",
            cancer_type="Colorectal Carcinoma",
            ncit_id="NCIT:C2955",
            assay=";".join(
                ['competitive hybridization', 'multicolor competition assay']),
            SL=True,  # All data in this set is True # TODO CHECK
        )
        # Symbols absent from the Entrez map: old symbol -> (symbol to report, id).
        manual_ids = {
            # This is a LINC, plus the symbol is old
            'FLJ34747': ('LINC00547', 'NCBIGene:400121'),
            'LOC283194': ('LOC283194', 'NCBIGene:283194'),  # an ncRNA
            'LOC285556': ('LOC285556', 'NCBIGene:285556'),
        }
        # Could not find these in NCBI Gene or HGNC.
        unmappable = ('LOC149654', 'LOC730000')

        interactions = defaultdict(list)
        with open(self.fname) as handle:
            for record in csv.DictReader(handle, delimiter='\t'):
                if len(record) < 8:
                    raise ValueError(
                        "Bad line in Luo2009 with less than 8 fields")
                gene_b = self.get_current_symbol(record['Symbol'])
                if gene_b == 'CXORF40A':
                    gene_b = 'EOLA1'
                if gene_b in self.entrez_dict:
                    gene_b_curie = "NCBIGene:{}".format(
                        self.entrez_dict.get(gene_b))
                elif gene_b in manual_ids:
                    gene_b, gene_b_curie = manual_ids[gene_b]
                elif gene_b in unmappable:
                    continue
                else:
                    raise ValueError(
                        "Could not get NCBI id for gene %s in Luo2009" %
                        gene_b)
                std_dev = float(record['SD.DLD1'])
                interaction = SyntheticLethalInteraction(
                    gene_B_symbol=gene_b,
                    gene_B_id=gene_b_curie,
                    effect_size=std_dev,
                    pmid=self.pmid,
                    **shared)
                interactions[GenePair(gene_a, gene_b)].append(interaction)
        return self._mark_maximum_entries(interactions)
 def parse(self):
     """Parse the tab-separated file ``self.fname`` into KRAS-centred interactions.

     Expected columns (exactly six): GeneID, Locus.ID, Accession,
     HCT-116.Z-score, HKE-3.Z-score, D.Z-score. ``GeneID`` holds the gene B
     symbol and the HCT-116 Z score is stored as the effect size. An
     interaction is flagged as synthetic lethal when the delta Z score is at
     least 3.3 and the HKE-3 Z score is below 2.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: if a row does not have six fields, or if a gene symbol
             cannot be mapped, is not a known unclear symbol and has a delta
             Z score of 2 or more.
     """
     kras_symbol = 'KRAS'
     kras_id = SlConstants.KRAS_GENE_ID
     kras_perturbation = SlConstants.ACTIVATING_MUTATION
     gene2_perturbation = SlConstants.SI_RNA
     assay_string = SlConstants.RNA_INTERFERENCE_ASSAY
     effect_type = 'stddev'
     cell_line = SlConstants.HCT_116
     cellosaurus = SlConstants.HCT_116_CELLOSAURUS
     cancer = SlConstants.COLORECTAL_CARCINOMA
     ncit = SlConstants.COLORECTAL_CARCINOMA_NCIT
     sli_dict = defaultdict(list)
     # Immunoglobulin or multiple mapping old symbols
     # COAS3, CES4, POM121L1, MYCL2 are aliases for a pseudogene
     unclear_gene_symbols = {
         'MAD', 'IGHG4', 'DKFZp434C1418', 'COAS3', 'HNT', 'CES4', 'SAS',
         'HLA-DRB3', 'LOC90557', 'POM121L1', 'MLL2', '37499', 'MYCL2',
         'CAMKIINALPHA', 'TGIF', 'PCDHA2', 'PCDHA9'
     }
     with open(self.fname) as csvfile:
         csvreader = csv.DictReader(csvfile, delimiter='\t')
         for row in csvreader:
             if len(row) != 6:
                 raise ValueError("Line has %d fields (should have 6): %s" %
                                  (len(row), row))
             geneB_sym = self.get_current_symbol(row['GeneID'])
             HCT116_zscore = float(row['HCT-116.Z-score'])
             HKE3_zscore = float(row['HKE-3.Z-score'])
             delta_zscore = float(row['D.Z-score'])
             # Translate the outdated symbol BEFORE the id lookup. Renaming
             # it in an elif branch left geneB_id unset for this row, so the
             # id of the previous row (or none at all) was used.
             if geneB_sym == 'C9ORF96':
                 geneB_sym = 'STKLD1'
             if geneB_sym in self.entrez_dict:
                 geneB_id = "NCBIGene:{}".format(
                     self.entrez_dict.get(geneB_sym))
             elif geneB_sym in unclear_gene_symbols:
                 continue
             elif delta_zscore < 2:
                 # One of the many negative samples; it can be skipped if it
                 # cannot be mapped.
                 continue
             else:
                 raise ValueError(
                     "Could not find id for gene %s in Steckel 2012" %
                     geneB_sym)
             if geneB_sym == "KRAS":
                 continue  # This was an internal control!
             sli = SyntheticLethalInteraction(
                 gene_A_symbol=kras_symbol,
                 gene_A_id=kras_id,
                 gene_B_symbol=geneB_sym,
                 gene_B_id=geneB_id,
                 gene_A_pert=kras_perturbation,
                 gene_B_pert=gene2_perturbation,
                 effect_type=effect_type,
                 effect_size=HCT116_zscore,
                 cell_line=cell_line,
                 cellosaurus_id=cellosaurus,
                 cancer_type=cancer,
                 ncit_id=ncit,
                 assay=assay_string,
                 pmid=self.pmid,
                 SL=delta_zscore >= 3.3 and HKE3_zscore < 2)
             sli_dict[GenePair(kras_symbol, geneB_sym)].append(sli)
     return self._mark_maximum_entries(sli_dict)
 def parse(self):
     """Build VHL-centred interactions from the tab-separated ``self.fname``.

     Columns: gene, differential, cell, table. Gene A is always VHL. The
     upper-cased ``gene`` value is gene B, ``differential`` is the effect
     size, and ``cell`` must be either ``RCC4`` or ``786-0``.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: if a row has fewer than 4 fields, a symbol is missing
             from ``self.entrez_dict``, or the cell type is not recognised.
     """
     gene_a = 'VHL'
     gene_a_id = SlConstants.VHL_GENE_ID
     gene_a_pert = SlConstants.LOF_MUTATION
     gene_b_pert = SlConstants.SH_RNA
     # Not used further down; kept so the same constants are still looked up.
     assays = [SlConstants.COMPETITIVE_HYBRIDIZATION, SlConstants.MULTICOLOR_COMPETITION_ASSAY]
     # Value of the ``cell`` column -> (cell line name, Cellosaurus id).
     cell_lines = {
         'RCC4': ("RCC4", "CVCL_0498"),
         '786-0': ("786-0", "CVCL_1051"),
     }
     # Symbols that are skipped without further checks.
     ignored_symbols = ("IRR", "HER4")
     # Symbols that are not current and either could not be matched or match
     # several candidates (e.g. PITSLRE could be either CDK11A or CDK11B).
     unclear_gene_symbols = {'PITSLRE', 'TAK1', 'PKD3', 'CAMLCK', 'MAPAPK3', 'CK1E', 'CK2A2', 'PDGRFB', 'ZC1/HGK'}
     interactions = defaultdict(list)
     with open(self.fname) as handle:
         for record in csv.DictReader(handle, delimiter='\t'):
             if len(record) < 4:
                 raise ValueError("Only got %d fields but was expecting 4" % len(record))
             gene_b = self.get_current_symbol(record['gene'].upper())
             if gene_b in ignored_symbols or gene_b in unclear_gene_symbols:
                 continue
             if gene_b not in self.entrez_dict:
                 raise ValueError("Could not find id for %s in Bommi 2008" % gene_b)
             gene_b_id = "NCBIGene:{}".format(self.entrez_dict.get(gene_b))
             differential = float(record['differential'])
             cell = record['cell']
             if cell not in cell_lines:
                 raise ValueError("Did not recognize cell type '%s'" % cell)
             cell_line, cellosaurus = cell_lines[cell]
             assay_string = "differential viability assay {}({})".format(cell, record['table'])
             interaction = SyntheticLethalInteraction(
                 gene_A_symbol=gene_a,
                 gene_A_id=gene_a_id,
                 gene_B_symbol=gene_b,
                 gene_B_id=gene_b_id,
                 gene_A_pert=gene_a_pert,
                 gene_B_pert=gene_b_pert,
                 effect_type='differential_viability',
                 effect_size=differential,
                 cell_line=cell_line,
                 cellosaurus_id=cellosaurus,
                 cancer_type=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA,
                 ncit_id=SlConstants.CLEAR_CELL_RENAL_CELL_CARCINOMA_NCIT,
                 assay=assay_string,
                 pmid=self.pmid,
                 SL=True)  # All data in this set is True # TODO CHECK
             interactions[GenePair(gene_a, gene_b)].append(interaction)
     return self._mark_maximum_entries(interactions)
 def parse(self):
     """Build ATR-centred interactions from the tab-separated ``self.fname``.

     Each row provides ``Gene Symbol`` (gene B) and four replicate pairs
     ``Mock.N`` / ``ATRi.N`` (N = 1..4). The effect size is the mean of the
     four ``ATRi.N - Mock.N`` differences. Rows whose gene B is ATR itself
     are skipped.

     Returns:
         The result of ``self._mark_maximum_entries`` applied to the
         interactions grouped by gene pair.

     Raises:
         ValueError: if a symbol is missing from ``self.entrez_dict``, or if
             the mean difference is neither below -2 nor at least 0.
     """
     gene_a = 'ATR'
     gene_a_id = 'NCBIGene:545'
     # A few special cases -- capitalization is not correct in the HGNC file
     renamed = {
         'C10ORF119': 'MCMBP',
         'C15ORF20': 'PIF1',
         'CXORF53': 'BRCC3',
     }
     interactions = defaultdict(list)
     with open(self.fname) as handle:
         for record in csv.DictReader(handle, delimiter='\t'):
             gene_b = self.get_current_symbol(record['Gene Symbol'])
             if gene_b == 'ATR':
                 # Self interaction, not a SLI!
                 continue
             gene_b = renamed.get(gene_b, gene_b)
             if gene_b not in self.entrez_dict:
                 raise ValueError("Could not find id for gene %s in Mohni 2014" % gene_b)
             gene_b_id = self.get_ncbigene_curie(gene_b)
             # Treated-minus-mock difference for each of the four replicates.
             differences = []
             for replicate in ('1', '2', '3', '4'):
                 mock = float(record['Mock.' + replicate])
                 treated = float(record['ATRi.' + replicate])
                 differences.append(treated - mock)
             mean_difference = np.array(differences).mean()
             # The call is made on the mean of the replicates: below -2 is
             # synthetic lethal, 0 or above is not, anything else is rejected.
             if mean_difference < -2:
                 is_sl = True
             elif mean_difference >= 0:
                 is_sl = False
             else:
                 raise ValueError("Expecting mean either below -2 or above 0")
             interaction = SyntheticLethalInteraction(
                 gene_A_symbol=gene_a,
                 gene_A_id=gene_a_id,
                 gene_B_symbol=gene_b,
                 gene_B_id=gene_b_id,
                 gene_A_pert=SlConstants.PHARMACEUTICAL,
                 gene_B_pert=SlConstants.SI_RNA,
                 effect_type=SlConstants.ZSCORE,
                 effect_size=mean_difference,
                 cell_line=SlConstants.U2OS_CELL,
                 cellosaurus_id=SlConstants.U2OS_CELLOSAURUS,
                 cancer_type='n/a',
                 ncit_id='n/a',
                 assay=SlConstants.RNA_INTERFERENCE_ASSAY,
                 pmid=self.pmid,
                 SL=is_sl)
             interactions[GenePair(gene_a, gene_b)].append(interaction)
     return self._mark_maximum_entries(interactions)