示例#1
0
 def execute( self ):
     """Run the query stored in the query file and export its result.

     The database path, the query file path and the species are read from
     the OptionManager. The species is checked against the database with
     SQLManager.check_species before the query is read, performed and
     exported.

     Raises:
         RainetException: if the species option is None or empty.
     """

     # Get the option values
     options = OptionManager.get_instance()
     self.DBPath = options.get_option( OptionConstants.OPTION_DB_NAME )
     query_file_path = options.get_option( OptionConstants.OPTION_QUERY_FILE )
     species = options.get_option( OptionConstants.OPTION_SPECIES )

     # A species is mandatory; None is tested by identity, as recommended by PEP 8
     if species is None or len( species ) == 0:
         raise RainetException( "InteractiveQueryStrategy.execute: You must specify a species in your command line. Please check help.")

     # Build a SQL session to DB
     sql_manager = SQLManager.get_instance()
     sql_manager.set_DBpath( self.DBPath )
     sql_session = sql_manager.get_session()

     # Check if the species in the DB correspond to the species given by the user
     SQLManager.check_species( sql_session, species, True )

     logger = Logger.get_instance()

     # Read the query from file
     logger.info( "InteractiveQueryStrategy.execute : Reading query..." )
     query_string = self.read_query( query_file_path )

     # Get the result of the query on DB
     logger.info( "InteractiveQueryStrategy.execute : Querying database..." )
     query_result = self.perform_query( sql_session, query_string )

     # Export query result; the query file path is handed to the exporter as well
     logger.info( "InteractiveQueryStrategy.execute : Exporting result..." )
     self.export_query_result( query_result, query_file_path )

     logger.info( "InteractiveQueryStrategy.execute : Finished." )
示例#2
0
    def execute(self):
        """Read the insertion options and launch the data insertion.

        The database path and the force-override option are stored on the
        instance before insert_data() is called.
        """
        options = OptionManager.get_instance()

        self.DBPath = options.get_option(OptionConstants.OPTION_DB_NAME)
        self.forceOverride = options.get_option(
            OptionConstants.OPTION_INSERTION_FORCE_OVERRIDE)

        self.insert_data()
示例#3
0
    def setUp(self):
        # Prepares the option values, the logger verbosity, the SQL session,
        # the test folders and the EnrichmentAnalysisStrategy instance.

        # Note: when running from the command line / main script the
        # OptionManager gets the default values, but in unittest every
        # argument must be set explicitly, whether optional or not.
        # Individual tests may override these values.
        options = OptionManager.get_instance()
        option_values = [
            (OptionConstants.OPTION_VERBOSITY, "debug"),
            (OptionConstants.OPTION_DB_NAME,
             "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet_testing_DB.sqlite"
             ),  #rainet2016-06-17.human_expression_wPRI.sqlite")
            (OptionConstants.OPTION_SPECIES, "human"),
            (OptionConstants.OPTION_OUTPUT_FOLDER,
             "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_results/enrichmentAnalysis/"
             ),
            (OptionConstants.OPTION_ANNOTATION_TABLE, "NetworkModule"),
            (OptionConstants.OPTION_MINIMUM_PROTEIN_ANNOTATION,
             OptionConstants.DEFAULT_MINIMUM_PROTEIN_ANNOTATION),
            (OptionConstants.OPTION_MINIMUM_PROTEIN_INTERACTION,
             OptionConstants.DEFAULT_MINIMUM_PROTEIN_INTERACTION),
            (OptionConstants.OPTION_NUMBER_RANDOMIZATIONS,
             OptionConstants.DEFAULT_NUMBER_RANDOMIZATIONS),
            (OptionConstants.OPTION_EXPRESSION_WARNING,
             OptionConstants.DEFAULT_EXPRESSION_WARNING),
            (OptionConstants.OPTION_MINIMUM_EXPRESSION,
             OptionConstants.DEFAULT_MINIMUM_EXPRESSION),
            (OptionConstants.OPTION_LOWER_TAIL,
             OptionConstants.DEFAULT_LOWER_TAIL),
        ]
        # The options are applied in the order listed above
        for option_name, option_value in option_values:
            options.set_option(option_name, option_value)

        # Set the level of verbosity
        Logger.get_instance().set_level(
            options.get_option(OptionConstants.OPTION_VERBOSITY))

        # Setting up SQL manager on the database chosen above
        sql_manager = SQLManager.get_instance()
        sql_manager.set_DBpath(
            options.get_option(OptionConstants.OPTION_DB_NAME))
        self.sql_session = sql_manager.get_session()

        # Setting up internal test folder paths
        self.expectedFolder = "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/enrichmentAnalysis/test_expected"
        self.outputFolder = options.get_option(
            OptionConstants.OPTION_OUTPUT_FOLDER)

        # Create the strategy under test and hand it a SQL session.
        # NOTE(review): if this class derives from unittest.TestCase, the
        # attribute name `run` hides TestCase.run on the instance - confirm
        # this is intended.
        self.run = EnrichmentAnalysisStrategy()
        self.run.sql_session = sql_manager.get_session()
示例#4
0
    def setUp(self):
        # Prepares the option values, the logger verbosity, the SQL session,
        # the test folders and the AnalysisStrategy instance.

        # Note: when running from the command line / main script the
        # OptionManager gets the default values, but in unittest every
        # argument must be set explicitly, whether optional or not.
        # Individual tests may override these values.
        options = OptionManager.get_instance()
        option_values = [
            (OptionConstants.OPTION_VERBOSITY, "debug"),
            #        optionManager.set_option(OptionConstants.OPTION_DB_NAME, "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite")
            (OptionConstants.OPTION_DB_NAME, DB_PATH),
            (OptionConstants.OPTION_SPECIES, "human"),
            (OptionConstants.OPTION_OUTPUT_FOLDER,
             "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_results/"
             ),
            (OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE,
             OptionConstants.DEFAULT_INTERACTION_SCORE),
            (OptionConstants.OPTION_RNA_BIOTYPES,
             OptionConstants.DEFAULT_RNA_BIOTYPES),
            (OptionConstants.OPTION_GENCODE, OptionConstants.DEFAULT_GENCODE),
            (OptionConstants.OPTION_EXPRESSION_VALUE_CUTOFF,
             OptionConstants.DEFAULT_EXPRESSION_VALUE_CUTOFF),
            (OptionConstants.OPTION_EXPRESSION_TISSUE_CUTOFF,
             OptionConstants.DEFAULT_EXPRESSION_TISSUE_CUTOFF),
            (OptionConstants.OPTION_LOW_MEMORY,
             OptionConstants.DEFAULT_LOW_MEMORY),
        ]
        # The options are applied in the order listed above
        for option_name, option_value in option_values:
            options.set_option(option_name, option_value)

        # Set the level of verbosity
        Logger.get_instance().set_level(
            options.get_option(OptionConstants.OPTION_VERBOSITY))

        # Setting up SQL manager
        sql_manager = SQLManager.get_instance()
        sql_manager.set_DBpath(DB_PATH)
        self.sql_session = sql_manager.get_session()

        # Setting up internal test folder paths
        self.expectedFolder = "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_expected/Report"
        output_root = options.get_option(OptionConstants.OPTION_OUTPUT_FOLDER)
        self.outputFolder = output_root + "/Report/"

        # Create instance of strategy
        self.strategy = AnalysisStrategy()

        # Report only written for selected tests
        self.strategy.writeReportFile = 0
示例#5
0
    def test_RNA_filter_two(self):
        # Sets the RNA biotype option to "lincRNA", executes the strategy and
        # checks the filtered RNAs against a direct query of the LncRNA table.

        print("| test_RNA_filter_two | ")

        OptionManager.get_instance().set_option(
            OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA")
        self.strategy.execute()

        filtered_rnas = DataManager.get_instance().get_data(
            AnalysisStrategy.RNA_FILTER_KW)

        self.assertTrue(
            len(filtered_rnas) == 9,
            "asserting if number of objects retrieved is correct")

        # Every retained object must be mapped to the LncRNA class
        for rna in filtered_rnas:
            is_lncRNA = isinstance(rna, LncRNA)
            self.assertTrue(
                is_lncRNA,
                "check if the lncRNA is instance of LncRNA table/class")

        # Same selection, queried directly on the database
        query = self.sql_session.query(LncRNA)
        db_lincRNAs = query.filter(
            LncRNA.transcriptBiotype == "lincRNA").all()

        self.assertTrue(
            len(filtered_rnas) == len(db_lincRNAs),
            "asserting if number of objects in lncRNA default (off) option is same as querying directly lncRNA table"
        )
示例#6
0
    def test_annotation_report(self):
        # Executes the strategy with the annotation table currently set in the
        # options, then again with "KEGGPathway", and checks the protein
        # counts exposed by the strategy after each run.

        print("| test_annotation_report | ")

        strategy = self.run
        strategy.execute()

        # For NetworkModule

        # Background should be overlap between interactingProteins and Proteins with annotation

        #SELECT count(distinct( InteractingProtein.uniprotAC)) FROM InteractingProtein, ProteinNetworkModule WHERE  InteractingProtein.uniprotAC == ProteinNetworkModule.protein_id
        background_size = len(strategy.backgroundProteins)
        self.assertTrue(
            background_size == 3,
            "assert number of background proteins is correctly calculated")
        #self.assertTrue( len( self.run.backgroundProteins) == 37, "assert number of background proteins is correctly calculated")

        #SELECT count(distinct(ProteinNetworkModule.protein_id)) FROM ProteinNetworkModule -> 42
        self.assertTrue(
            strategy.protAnnotDictLen == 42,
            "assert number of proteins with annotations is correctly calculated"
        )

        interacting_proteins = DataManager.get_instance().get_data(
            EnrichmentAnalysisStrategy.PRI_PROT_KW)
        self.assertTrue(
            strategy.allProteinsWithInteractionDataLen ==
            len(interacting_proteins),
            "assert number of proteins with interactions is correctly calculated"
        )

        #SELECT count(distinct(ProteinNetworkModule.networkModule_id)) FROM ProteinNetworkModule -> 82. But this does not count interactions.
        # Union of the proteins of every annotation; only the disabled
        # assertion below uses it.
        annotated = strategy.annotWithInteractionDict
        pool = set()
        for annot in annotated:
            pool.update(annotated[annot])

        #self.assertTrue( len( pool) == len( self.run.backgroundProteins), "confirm number of proteins with annotation and interaction")

        # For KEGG Pathway
        OptionManager.get_instance().set_option(
            OptionConstants.OPTION_ANNOTATION_TABLE, "KEGGPathway")

        strategy.execute()

        #SELECT count(distinct(ProteinKEGGAnnotation.protein_id)) FROM ProteinKEGGAnnotation -> 22
        self.assertTrue(
            strategy.protAnnotDictLen == 22,
            "assert number of proteins with annotations is correctly calculated"
        )

        interacting_proteins = DataManager.get_instance().get_data(
            EnrichmentAnalysisStrategy.PRI_PROT_KW)
        self.assertTrue(
            strategy.allProteinsWithInteractionDataLen ==
            len(interacting_proteins),
            "assert number of proteins with interactions is correctly calculated"
        )
示例#7
0
    def test_RNA_filter_four(self):
        # Sets the gencode option to "1", executes the strategy and checks
        # the number of RNAs kept by the RNA filter.

        print("| test_RNA_filter_four | ")

        OptionManager.get_instance().set_option(
            OptionConstants.OPTION_GENCODE, "1")
        self.strategy.execute()

        data_manager = DataManager.get_instance()
        filtered_rnas = data_manager.get_data(AnalysisStrategy.RNA_FILTER_KW)
        rna_count = len(filtered_rnas)

        self.assertTrue(
            rna_count == 70,
            "asserting if number of object with gencode is correct")
示例#8
0
    def test_PRI_filter_one(self):
        # Sets the minimum interaction score option to "10.0", executes the
        # strategy and checks the number of interactions kept by the PRI filter.

        print("| test_PRI_filter_one | ")

        OptionManager.get_instance().set_option(
            OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, "10.0")
        self.strategy.execute()

        data_manager = DataManager.get_instance()
        interactions = data_manager.get_data(AnalysisStrategy.PRI_FILTER_KW)

        # Regarding peptide redundancy filter, there are two such cases in the
        # test database: one is misc_RNA (always filtered) and the other is
        # lincRNA, so only one peptide (interaction) is removed
        interaction_count = len(interactions)
        self.assertTrue(
            interaction_count == 9,
            "asserting if number of interactions above certain interaction score is correct"
        )
示例#9
0
    def test_retrieve_expression(self):
        # Executes the strategy with explicit expression options, calls
        # retrieve_expression() and checks the expression dictionaries it fills.

        print("| test_retrieve_expression | ")

        options = OptionManager.get_instance()
        options.set_option(OptionConstants.OPTION_EXPRESSION_WARNING, 0.8)
        options.set_option(OptionConstants.OPTION_MINIMUM_EXPRESSION, 1.0)

        strategy = self.run
        strategy.execute()
        strategy.retrieve_expression()

        rna_id = "ENST00000421534"
        protein_id = "P62826"

        # The protein must have tissue expression data, with 49 entries
        self.assertTrue(protein_id in strategy.protTissueExpressions)
        protein_entries = strategy.protTissueExpressions[protein_id]
        self.assertTrue(len(protein_entries) == 49)

        # The RNA must have 2 expression entries
        rna_entries = strategy.expressionDict[rna_id]
        self.assertTrue(len(rna_entries) == 2)
示例#10
0
    def build_database(self, path, force_override):
        """Create, or reuse, the database located at the given path.

        The tables of the class model are created when force_override is
        true or when the Protein table cannot be queried; in that case the
        species is also checked with SQLManager.check_species.

        Args:
            path: path of the database; handed to create_engine and kept in
                self.DBPath.
            force_override: when true, the database file is removed first and
                the model tables are created again.

        Raises:
            RainetException: if the species option is None or empty.
        """

        # Get the value of database species specified by the user
        species = OptionManager.get_instance().get_option(
            OptionConstants.OPTION_SPECIES)
        if species is None or len(species) == 0:
            raise RainetException(
                "SQLManager.build_database: You must specify a species in your command line. Please check help."
            )

        # Remove the database file if required
        if force_override:
            self.remove_database_file(path)

        # Create engine to dedicated database
        engine = self.create_engine(path)

        # Prepare the session factory bound to that engine
        session = sessionmaker()
        session.configure(bind=engine)

        # Look if the Protein table exists. If not, it means DB model was not created.
        # Any failure of the probing query is read as "model does not exist".
        model_exists = True
        probe_session = None
        try:
            probe_session = session()
            table_content = probe_session.query(
                DataConstants.PROTEIN_CLASS).all()
            if table_content is None:
                model_exists = False
        except Exception:
            model_exists = False
        finally:
            # Release the probing session so that it does not stay open
            if probe_session is not None:
                probe_session.close()

        # Create all the required table in DB according to class model
        # if required (override mode or model does not exist in DB)
        if force_override or not model_exists:
            Base.metadata.create_all(engine)
            sql_session = session()
            try:
                SQLManager.check_species(sql_session, species, True)
            finally:
                # Close the session even when the species check raises
                sql_session.close()

        # Keep the DB path
        self.DBPath = path

        Logger.get_instance().info('Database File created/used : ' + str(path))
示例#11
0
    def test_RNA_filter_one(self):
        # Sets the RNA biotype option to "protein_coding", executes the
        # strategy and checks the number and the class of the filtered RNAs.

        print("| test_RNA_filter_one | ")

        OptionManager.get_instance().set_option(
            OptionConstants.OPTION_RNA_BIOTYPES, "protein_coding")

        self.strategy.execute()

        filtered_rnas = DataManager.get_instance().get_data(
            AnalysisStrategy.RNA_FILTER_KW)
        rna_count = len(filtered_rnas)

        self.assertTrue(
            rna_count == 82,
            "asserting if number of objects retrieved is correct")

        # Every retained object must be mapped to the MRNA class
        for rna in filtered_rnas:
            is_mRNA = isinstance(rna, MRNA)
            self.assertTrue(
                is_mRNA, "check if the mRNA is instance of MRNA table/class")
示例#12
0
    def execute(self):
        """Instantiate the strategy selected in the options and execute it.

        The keyword returned by OptionManager.get_strategy() is suffixed
        with "Strategy" and the resulting class name is instantiated.
        A RainetException raised by the strategy itself is logged as an
        error and is not propagated.

        Raises:
            RainetException: if no strategy keyword was defined, or if no
                strategy object could be built from the keyword.
        """

        strategy_command = OptionManager.get_instance().get_strategy()

        # Guard clause: nothing can be run without a strategy keyword
        if strategy_command is None:
            raise RainetException(
                "Rainet.execute : No strategy was defined: aborting")

        # NOTE(review): eval() executes whatever expression the keyword
        # produces. The keyword comes from the options, so it should be
        # restricted to known strategy names - consider a lookup in an
        # explicit {keyword: class} mapping instead.
        try:
            strategy = eval(strategy_command + "Strategy()")
        except Exception:
            raise RainetException(
                "Rainet.execute : No strategy associated to keyword " +
                str(strategy_command))

        try:
            strategy.execute()
        except RainetException as raie:
            Logger.get_instance().error(
                "Rainet.execute: An exception occurred executing the command:\n"
                + raie.to_string())
示例#13
0
    def test_PRI_filter_two(self):
        # Combines an interaction score cutoff with RNA-level filters
        # (biotype and gencode) and checks the number of interactions kept.

        print("| test_PRI_filter_two | ")

        # important to create new SQLManager session if changing database
        SQLManager.get_instance().close_session()

        options = OptionManager.get_instance()
        #        optionManager.set_option(OptionConstants.OPTION_DB_NAME, "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite")
        overrides = [
            (OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, "28"),
            (OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA"),
            (OptionConstants.OPTION_GENCODE, 1),
        ]
        for option_name, option_value in overrides:
            options.set_option(option_name, option_value)

        self.strategy.execute()

        interactions = DataManager.get_instance().get_data(
            AnalysisStrategy.PRI_FILTER_KW)

        print(len(interactions))

        expected_count = AnalysisStrategyUnittest.TOTAL_PRIS_LINC_FILT
        self.assertTrue(
            len(interactions) == expected_count,
            "asserting if PRIs are affected by RNA-level filters")
示例#14
0
                "Rainet.execute: An exception occurred executing the command:\n"
                + raie.to_string())


#===============================================================================
# The main function
#===============================================================================
if __name__ == '__main__':

    try:

        # Create Logger instance by using the first log action.
        Logger.get_instance().info("Rainet : Starting...")

        # Store the options
        OptionManager.get_instance().initialize()

        # Set the level of verbosity
        Logger.get_instance().set_level(
            OptionManager.get_instance().get_option(
                OptionConstants.OPTION_VERBOSITY))

        # Instantiate Rainet with the correct database path
        rainet = Rainet()

        # Insert the data to database
        rainet.execute()

    except RainetException as rainet:
        Logger.get_instance().error(
            "Error during execution of Rainet. Aborting :\n" +
示例#15
0
    def test_expression_warning(self):
        # Runs the enrichment analysis on the larger test database with the
        # expression options set, then checks the sixth tab-separated field
        # of two specific lines of the enrichment report.

        print("| test_expression_warning | ")

        optionManager = OptionManager.get_instance()
        # need larger database for this test
        optionManager.set_option(
            OptionConstants.OPTION_DB_NAME,
            "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite"
        )

        # set parameters for expression test
        optionManager.set_option(OptionConstants.OPTION_EXPRESSION_WARNING,
                                 0.8)
        optionManager.set_option(OptionConstants.OPTION_MINIMUM_EXPRESSION,
                                 1.0)

        # so that only way to have skip test flag is by expression filter
        optionManager.set_option(
            OptionConstants.OPTION_MINIMUM_PROTEIN_ANNOTATION, 0)
        optionManager.set_option(
            OptionConstants.OPTION_MINIMUM_PROTEIN_INTERACTION, 0)

        optionManager.set_option(OptionConstants.OPTION_NUMBER_RANDOMIZATIONS,
                                 1)

        # important to create new SQLManager session if changing database
        SQLManager.get_instance().close_session()

        self.run.execute()

        reportPath = (self.outputFolder + "/" +
                      EnrichmentAnalysisStrategy.REPORT_ENRICHMENT)
        with open(reportPath) as inFile:
            # the first line is read and discarded (presumably a header - verify)
            inFile.readline()
            for line in inFile:
                # A line can never start with both prefixes, so the two
                # checks are sibling branches. The whole report is scanned so
                # that neither check depends on the order of the lines.
                if line.startswith("ENST00000309775\t344"):
                    spl = line.split("\t")
                    # protein-mRNA correspondence "ENST00000321265","Q9Y266"
                    self.assertTrue(
                        spl[5] == "0",
                        "example where protein is expressed in all tissues, therefore it passes the filter"
                    )

                elif line.startswith("ENST00000309775\t351"):
                    spl = line.split("\t")
                    self.assertTrue(
                        spl[5] == "1",
                        "example where the two proteins are not expressed in common tissues, therefore it does not pass the filter"
                    )

                    # expression of RNA
                    #set(['Uterus', 'Brain - Cerebellum', 'Cells - EBV-transformed lymphocytes', 'Brain - Cerebellar Hemisphere'])
                    #{'Uterus': 0, 'Brain - Cerebellar Hemisphere': 0, 'Cells - EBV-transformed lymphocytes': 0, 'Brain - Cerebellum': 0}

                    # interacting/annotated proteins
                    #['P51843', 'Q14994']
                    # expression of one of the proteins
                    # "ENST00000378970","Adrenal Gland","31.103"
                    # "ENST00000378970","Testis","20.62"
                    # "ENST00000378970","Ovary","2.754"
                    # "ENST00000378970","Pituitary","2.505"
                    # "ENST00000378970","Brain - Hypothalamus","1.109"
                    # "ENST00000378970","Brain - Amygdala","1.021"

                    # there is no overlap of tissues.
                    # Note: I did not check other protein, it has too many mRNAs associated to check manually.

        # important to create new SQLManager session if changing database
        SQLManager.get_instance().close_session()
示例#16
0
    def test_PRI_filter_three(self):
        # Runs the filters on the larger test database, calls
        # dump_filter_PRI_expression() and checks the dictionaries it fills
        # on the strategy as well as the tissue data kept by the DataManager.

        print("| test_PRI_filter_three | ")

        # Overwrite default values
        options = OptionManager.get_instance()
        overrides = [
            (OptionConstants.OPTION_DB_NAME,
             "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite"
             ),
            #        optionManager.set_option(OptionConstants.OPTION_DB_NAME, "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_backup/RNA/rainet2016-07-07.human_linc_prelim.sqlite")
            (OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, 100),  # 100
            (OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA"),
            (OptionConstants.OPTION_GENCODE, 1),
            (OptionConstants.OPTION_EXPRESSION_VALUE_CUTOFF, 1.0),  # 1.0
            (OptionConstants.OPTION_EXPRESSION_TISSUE_CUTOFF, 1.0),  # 1
        ]
        for option_name, option_value in overrides:
            options.set_option(option_name, option_value)

        # important to create new SQLManager session if changing database
        SQLManager.get_instance().close_session()

        # Run strategy step by step
        self.strategy = AnalysisStrategy()
        strategy = self.strategy

        strategy.execute()
        strategy.execute(run=0)

        strategy.filter_RNA()
        strategy.filter_protein()
        strategy.filter_PRI()

        selectedInteractions = DataManager.get_instance().get_data(
            AnalysisStrategy.PRI_FILTER_KW)
        initial_count = len(selectedInteractions)

        self.assertTrue(
            initial_count == 4298,
            "assert if number of initial interactions is correct")

        # run main function we want to test
        strategy.dump_filter_PRI_expression()

        # select count(distinct(proteinID)) from MRNA  --> 57366
        self.assertTrue(
            len(strategy.mRNADict) == 57366,
            "assert if number of mRNA to protein correspondence is correct")

        # ENST00000005180 --> Q9Y258, ENST00000394905 --> Q9Y258 #confirmed in Ensembl82 website
        expected_transcripts = {"ENST00000005180", "ENST00000394905"}
        self.assertTrue(
            set(strategy.mRNADict["Q9Y258"]) == expected_transcripts,
            "assert if a specific mRNA-protein correspondence is correct")

        # select count(distinct(transcriptID)) from RNATIssueExpression --> 184278
        self.assertTrue(
            len(strategy.expressionDict) == 184278,
            "assert number of transcripts with expression data is correct")

        # One expression value is expected per tissue of the Tissue table
        tissue_rows = self.sql_session.query(Tissue.tissueName).all()
        tissues = [str(row[0]) for row in tissue_rows]
        self.assertTrue(
            len(strategy.expressionDict["ENST00000394905"]) == len(tissues),
            "assert number of expression values match number of tissues ")

        # grep ENST00000394905 transcript_expression_metrics_no_outliers.tsv
        # ENST00000394905    Skin - Sun Exposed (Lower leg)    0.124    0.199    0.000    1.60395658469    0.754
        expected_tuple = (0.124, "Skin - Sun Exposed (Lower leg)")
        tuple_found = any(
            tissTuple == expected_tuple
            for tissTuple in strategy.expressionDict["ENST00000394905"])
        self.assertTrue(
            tuple_found,
            "assert specific transcript expression data is correct")

        ## Really test expression filter

        # check protein expression of a specific protein
        # "ENST00000294652","Q5VWK0"
        # "ENST00000370040","Q5VWK0"
        # "ENST00000444143","Q5VWK0"
        # "ENST00000495380","Q5VWK0"
        self.assertTrue(
            len(strategy.ProtMRNATissueExpressions["Q5VWK0"]) == 4)
        liver_value = strategy.ProtMRNATissueExpressions["Q5VWK0"][
            "ENST00000370040"]["Liver"]
        self.assertTrue(liver_value == 0)
        testis_value = strategy.ProtMRNATissueExpressions["Q5VWK0"][
            "ENST00000495380"]["Testis"]
        self.assertTrue(testis_value == 3.363)

        # ENST00000495380, an MRNA of Q5VWK0 protein, has expression value > 1.0 RPKM in testis, given by one of the mRNAs
        # This protein is not present in any other tissue
        #"ENST00000294652","Pituitary","0.012"
        #"ENST00000444143","Testis","0.488"
        #"ENST00000495380","Testis","3.363"

        # InteractingRNA we want to test as positive interaction: ENST00000413466
        # "ENST00000413466","Testis","1.006"

        # InteractingRNA we want to test as negative interaction: ENST00000423943
        # "ENST00000423943","Testis","0.512"

        proteinExpressionTissues = DataManager.get_instance().get_data(
            AnalysisStrategy.PROT_TISSUES_KW)
        rnaExpressionTissues = DataManager.get_instance().get_data(
            AnalysisStrategy.RNA_TISSUES_KW)
        expressedInteractionsTissues = DataManager.get_instance().get_data(
            AnalysisStrategy.PRI_TISSUES_KW)

        # proteinExpressionTissues: key -> tissue, value -> set of protein IDs
        self.assertTrue("Q5VWK0" in proteinExpressionTissues["Testis"])
        self.assertTrue("Q5VWK0" not in proteinExpressionTissues["Pancreas"])

        # rnaExpressionTissues: key -> tissue, value -> set of tx IDs
        self.assertTrue("ENST00000413466" in rnaExpressionTissues["Testis"])

        # expressedInteractionsTissues: key -> transcriptID|proteinID (pair), value -> set of tissues
        positive_pair = "ENST00000413466|Q5VWK0"
        self.assertTrue(positive_pair in expressedInteractionsTissues,
                        "assert if interaction passes cutoffs")
        self.assertTrue(
            len(expressedInteractionsTissues[positive_pair]) == 1,
            "assert if number of tissues passing cutoff is correct")

        self.assertTrue(
            "ENST00000423943" not in rnaExpressionTissues["Testis"])
        negative_pair = "ENST00000423943|Q5VWK0"
        self.assertTrue(negative_pair not in expressedInteractionsTissues,
                        "assert that interaction is not present")

        newSelectedInteractions = DataManager.get_instance().get_data(
            AnalysisStrategy.PRI_FILTER_KW)

        self.assertTrue(
            len(newSelectedInteractions) <= len(selectedInteractions),
            "expression filtered interactions should be equal or less than initial ones"
        )
示例#17
0
    def insert_data(self):

        # Create Logger instance by using the first log action.
        Logger.get_instance().info(
            "InsertionStrategy.insert_data: Starting...")

        #         # Backup the database file
        #         try:
        #             Logger.get_instance().info( "InsertionStrategy.insert_data:   Backuping DB file..." )
        #             shutil.copyfile(self.DBPath, self.DBPath + ".back")
        #         except IOError as ioe:
        #             Logger.get_instance().info( " warning : Unable to backup database file : " + self.DBPath + " : " + str( ioe))

        # Create database sqlite file at the provided path
        SQLManager.get_instance().build_database(self.DBPath,
                                                 self.forceOverride)

        self.check_database_tables()

        # Retrieve the inertion properties
        PropertyManager.get_instance().read_properties(
            OptionManager.get_instance().get_option(
                OptionConstants.OPTION_INSERTION_PROPERTIES_PATH, True))

        # Start chrono
        Timer.get_instance().start_chrono()

        # Indicate insertion mode
        if self.forceOverride:
            Logger.get_instance().info(" -- MODE FORCE OVERRIDE -- ")
        else:
            Logger.get_instance().info(" -- MODE RESUME -- ")

        #=======================================================================
        # INSERTION OF DATA
        #=======================================================================
        try:

            #===================================================================
            # PROTEIN DEFINITION
            #===================================================================

            # Parse the protein file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.PROTEIN_UNIPROT_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, True,
                                      DataConstants.PROTEIN_HEADERS,
                                      DataConstants.PROTEIN_CLASS,
                                      DataConstants.PROTEIN_PARAMS, None,
                                      DataConstants.PROTEIN_COMMENT_CHAR)

            # Parse the protein cross references file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.PROTEIN_CROSSREFERENCES_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False,
                DataConstants.PROTEIN_CROSS_REFERENCE_HEADERS,
                DataConstants.PROTEIN_CROSS_REFERENCE_CLASS,
                DataConstants.PROTEIN_CROSS_REFERENCE_PARAMS, None,
                DataConstants.PROTEIN_CROSS_REFERENCE_COMMENT_CHAR)

            # Parse the protein isoform file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.PROTEIN_ISOFORMS_PROPERTY, True)
            self.launch_insertion_Fasta(
                input_file, DataConstants.ISOFORM_CLASS,
                DataConstants.ISOFORM_REGULAR_EXPRESSION,
                DataConstants.ISOFORM_GROUPS, DataConstants.ISOFORM_PARAMS,
                DataConstants.ISOFORM_PARAMS_VALUE_ALTERNATIVE,
                DataConstants.ISOFORM_COMMENT_CHAR)

            # Parse the protein domain file of SMART DB
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.PROTEIN_DOMAIN_SMART_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, True, DataConstants.PROTEIN_DOMAIN_HEADERS_SMART,
                DataConstants.PROTEIN_DOMAIN_CLASS,
                DataConstants.PROTEIN_DOMAIN_PARAM_SMART,
                DataConstants.PROTEIN_DOMAIN_VALUE_SMART,
                DataConstants.PROTEIN_DOMAIN_COMMENT_CHAR, "SMART", False)

            # Parse the protein domain file of PFAM DB
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.PROTEIN_DOMAIN_PFAM_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.PROTEIN_DOMAIN_HEADERS_PFAM,
                DataConstants.PROTEIN_DOMAIN_CLASS,
                DataConstants.PROTEIN_DOMAIN_PARAM_PFAM,
                DataConstants.PROTEIN_DOMAIN_VALUE_PFAM,
                DataConstants.PROTEIN_DOMAIN_COMMENT_CHAR, "PFAM", False)

            #===================================================================
            # FUNCTION AND PATHWAY ANNOTATIONS
            #===================================================================

            # Parse the Gene Ontology file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.GENE_ONTOLOGY_DEFINITION_PROPERTY, True)
            self.launch_insertion_Obo(
                input_file, DataConstants.GENE_ONTOLOGY_CLASS,
                DataConstants.GENE_ONTOLOGY_ID_TAG,
                DataConstants.GENE_ONTOLOGY_NAME_TAG,
                DataConstants.GENE_ONTOLOGY_NAMESPACE_TAG)

            # Parse the Protein Gene Ontology annotation file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.GENE_ONTOLOGY_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.PROTEIN_GO_ANNOTATION_HEADERS,
                DataConstants.PROTEIN_GO_ANNOTATION_CLASS,
                DataConstants.PROTEIN_GO_ANNOTATION_PARAMS, None,
                DataConstants.PROTEIN_GO_ANNOTATION_COMMENT_CHAR)

            # Parse the KEGG pathway file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.KEGG_PATHWAY_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, True,
                                      DataConstants.KEGG_PATHWAY_HEADERS,
                                      DataConstants.KEGG_PATHWAY_CLASS,
                                      DataConstants.KEGG_PATHWAY_PARAMS, None,
                                      DataConstants.KEGG_PATHWAY_COMMENT_CHAR)

            # Parse the Protein KEGG Pathway annotation file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.KEGG_PATHWAY_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, True,
                DataConstants.KEGG_PATHWAY_ANNOTATION_HEADERS,
                DataConstants.KEGG_PATHWAY_ANNOTATION_CLASS,
                DataConstants.KEGG_PATHWAY_ANNOTATION_PARAMS, None,
                DataConstants.KEGG_PATHWAY_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # REACTOME
            #===================================================================

            # Parse the Reactome pathway file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.REACTOME_PATHWAY_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.REACTOME_PATHWAY_HEADERS,
                DataConstants.REACTOME_PATHWAY_CLASS,
                DataConstants.REACTOME_PATHWAY_PARAMS, None,
                DataConstants.REACTOME_PATHWAY_COMMENT_CHAR)

            # Parse the Protein Reactome Pathway annotation file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.REACTOME_PATHWAY_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False,
                DataConstants.REACTOME_PATHWAY_ANNOTATION_HEADERS,
                DataConstants.REACTOME_PATHWAY_ANNOTATION_CLASS,
                DataConstants.REACTOME_PATHWAY_ANNOTATION_PARAMS, None,
                DataConstants.REACTOME_PATHWAY_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # BIOPLEX
            #===================================================================

            # Parse the file listing Bioplex clusters
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.BIOPLEX_CLUSTER_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.BIOPLEX_CLUSTER_HEADERS,
                DataConstants.BIOPLEX_CLUSTER_CLASS,
                DataConstants.BIOPLEX_CLUSTER_PARAMS, None,
                DataConstants.BIOPLEX_CLUSTER_COMMENT_CHAR)

            # Parse the file with Bioplex annotations
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.BIOPLEX_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.BIOPLEX_ANNOTATION_HEADERS,
                DataConstants.BIOPLEX_ANNOTATION_CLASS,
                DataConstants.BIOPLEX_ANNOTATION_PARAMS, None,
                DataConstants.BIOPLEX_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # WAN CLUSTERS
            #===================================================================

            # Parse the file listing Wan clusters
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.WAN_CLUSTER_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, False,
                                      DataConstants.WAN_CLUSTER_HEADERS,
                                      DataConstants.WAN_CLUSTER_CLASS,
                                      DataConstants.WAN_CLUSTER_PARAMS, None,
                                      DataConstants.WAN_CLUSTER_COMMENT_CHAR)

            # Parse the file with Wan annotations
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.WAN_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.WAN_ANNOTATION_HEADERS,
                DataConstants.WAN_ANNOTATION_CLASS,
                DataConstants.WAN_ANNOTATION_PARAMS, None,
                DataConstants.WAN_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # CORUM CLUSTERS
            #===================================================================

            # Parse the file listing Corum clusters
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.CORUM_CLUSTER_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, False,
                                      DataConstants.CORUM_CLUSTER_HEADERS,
                                      DataConstants.CORUM_CLUSTER_CLASS,
                                      DataConstants.CORUM_CLUSTER_PARAMS, None,
                                      DataConstants.CORUM_CLUSTER_COMMENT_CHAR)

            # Parse the file with Corum annotations
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.CORUM_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.CORUM_ANNOTATION_HEADERS,
                DataConstants.CORUM_ANNOTATION_CLASS,
                DataConstants.CORUM_ANNOTATION_PARAMS, None,
                DataConstants.CORUM_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # CUSTOM CLUSTERS
            #===================================================================

            # Parse the file listing Custom clusters
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.CUSTOM_CLUSTER_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.CUSTOM_CLUSTER_HEADERS,
                DataConstants.CUSTOM_CLUSTER_CLASS,
                DataConstants.CUSTOM_CLUSTER_PARAMS, None,
                DataConstants.CUSTOM_CLUSTER_COMMENT_CHAR)

            # Parse the file with Custom annotations
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.CUSTOM_ANNOTATION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.CUSTOM_ANNOTATION_HEADERS,
                DataConstants.CUSTOM_ANNOTATION_CLASS,
                DataConstants.CUSTOM_ANNOTATION_PARAMS, None,
                DataConstants.CUSTOM_ANNOTATION_COMMENT_CHAR)

            #===================================================================
            # INTERACTOME
            #===================================================================

            # Parse the protein interaction file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.INTERACTOME_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, False,
                                      DataConstants.INTERACTOME_HEADER,
                                      DataConstants.INTERACTOME_CLASS,
                                      DataConstants.INTERACTOME_PARAMS, None,
                                      DataConstants.INTERACTOME_COMMENT_CHAR)

            # Parse the protein interaction network file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.INTERACTOME_NETWORK_DEFINITION_PROPERTY, True)
            ppi_default_values = [None, None, os.path.basename(input_file)]
            self.launch_insertion_TSV(
                input_file, False, DataConstants.INTERACTOME_NETWORK_HEADER,
                DataConstants.INTERACTOME_NETWORK_CLASS,
                DataConstants.INTERACTOME_NETWORK_PARAMS, ppi_default_values,
                DataConstants.INTERACTOME_NETWORK_COMMENT_CHAR)

            # Parse the Network Module file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_DEFINITION_PROPERTY, True)
            self.launch_insertion_NetworkModule(
                input_file, DataConstants.INTERACTOME_NETWORK_PARTITION_CLASS,
                DataConstants.INTERACTOME_NETWORK_PARTITION_CLASS_TAG,
                DataConstants.INTERACTOME_NETWORK_PARTITION_COMMENT_CHAR)

            # Parse the Network Module Annotation file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_ANNOTATION_PROPERTY, True)
            self.launch_insertion_NetworkModuleAnnotation(
                input_file,
                DataConstants.INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS,
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS_TAG,
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS_REGEX,
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_ANNOTATION_PROTEIN_TAG,
                DataConstants.
                INTERACTOME_NETWORK_PARTITION_ANNOTATION_ANNOTATION_TAG,
                DataConstants.INTERACTOME_NETWORK_PARTITION_COMMENT_CHAR)

            # Parse the protein redundancy file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.
                INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_PROPERTY, True)
            interactome_network_redundancy_definition_value = [
                None, basename(input_file), None
            ]
            self.launch_insertion_TSV(
                input_file, False, DataConstants.
                INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_HEADERS,
                DataConstants.INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_CLASS,
                DataConstants.INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_PARAMS,
                interactome_network_redundancy_definition_value, DataConstants.
                INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_COMMENT_CHAR,
                "Redundancy", False)

            #===================================================================
            # RNA DEFINITION
            #===================================================================

            # Make query of specific type of protein cross references to speed up insertion
            DataManager.get_instance().perform_query(
                DataConstants.PROTEIN_ENSP_XREF_KW,
                "query( ProteinCrossReference.protein_id,ProteinCrossReference.crossReferenceID ).filter(ProteinCrossReference.sourceDB == DataConstants.PROTEIN_ENSP_XREF_DB).all()"
            )
            # Convert query into a dictionary
            DataManager.get_instance().query_to_dict(
                DataConstants.PROTEIN_ENSP_XREF_KW, 1, 0)

            # Parse the RNA file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.RNA_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(input_file, True,
                                      DataConstants.RNA_HEADERS,
                                      DataConstants.RNA_CLASS,
                                      DataConstants.RNA_PARAMS, None,
                                      DataConstants.RNA_COMMENT_CHAR)

            # Parse the RNA cross references file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.RNA_CROSS_REFERENCE_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False, DataConstants.RNA_CROSS_REFERENCE_HEADERS,
                DataConstants.RNA_CROSS_REFERENCE_CLASS,
                DataConstants.RNA_CROSS_REFERENCE_PARAMS, None,
                DataConstants.RNA_CROSS_REFERENCE_COMMENT_CHAR)

            #===================================================================
            # RNA TISSUE EXPRESSION
            #===================================================================

            # Make query of all RNA IDs to speed up insertion
            DataManager.get_instance().perform_query(DataConstants.RNA_ALL_KW,
                                                     "query( RNA ).all()")
            # Format query into dict data structure
            DataManager.get_instance().query_to_object_dict(
                DataConstants.RNA_ALL_KW, "transcriptID")

            # Make query of all Protein IDs (uniprotAC) to speed up insertion
            DataManager.get_instance().perform_query(DataConstants.PROT_ALL_KW,
                                                     "query( Protein ).all()")
            # Format query into dict data structure
            DataManager.get_instance().query_to_object_dict(
                DataConstants.PROT_ALL_KW, "uniprotAC")

            # Parse the RNA tissue expression file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.RNA_TISSUE_EXPRESSION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, True, DataConstants.RNA_TISSUE_EXPRESSION_HEADERS,
                DataConstants.RNA_TISSUE_EXPRESSION_CLASS,
                DataConstants.RNA_TISSUE_EXPRESSION_PARAMS,
                DataConstants.RNA_TISSUE_EXPRESSION_VALUE,
                DataConstants.RNA_TISSUE_EXPRESSION_COMMENT_CHAR)

            #===================================================================
            # PROTEIN RNA INTERACTION
            #===================================================================

            self.forceOverride = 1

            # Parse the file listing RNA with catRAPID data
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.INTERACTING_RNA_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, True,
                DataConstants.INTERACTING_RNA_DEFINITION_HEADERS,
                DataConstants.INTERACTING_RNA_DEFINITION_CLASS,
                DataConstants.INTERACTING_RNA_DEFINITION_PARAMS, None,
                DataConstants.INTERACTING_RNA_DEFINITION_COMMENT_CHAR)

            # Parse the file listing Proteins with catRAPID data
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.INTERACTING_PROTEIN_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, True,
                DataConstants.INTERACTING_PROTEIN_DEFINITION_HEADERS,
                DataConstants.INTERACTING_PROTEIN_DEFINITION_CLASS,
                DataConstants.INTERACTING_PROTEIN_DEFINITION_PARAMS, None,
                DataConstants.INTERACTING_PROTEIN_DEFINITION_COMMENT_CHAR)

            # Initialize data items to store missing interactions
            if DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_RNA_KW not in DataManager.get_instance(
            ).data:
                DataManager.get_instance().store_data(
                    DataConstants.
                    PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_RNA_KW, [])
            if DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_PROT_KW not in DataManager.get_instance(
            ).data:
                DataManager.get_instance().store_data(
                    DataConstants.
                    PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_PROT_KW, [])

            # Parse the ProteinRNAInteractionCatRAPID file
            input_file = PropertyManager.get_instance().get_property(
                DataConstants.
                PROTEIN_RNA_INTERACTION_CATRAPID_DEFINITION_PROPERTY, True)
            self.launch_insertion_TSV(
                input_file, False,
                DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_HEADERS,
                DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_CLASS,
                DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_PARAMS, None,
                DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_COMMENT_CHAR)

            self.forceOverride = 0

            # Remove data that will no longer be used to reduce memory usage
            DataManager.get_instance().delete_data(
                DataConstants.PROTEIN_ENSP_XREF_KW)
            DataManager.get_instance().delete_data(DataConstants.RNA_ALL_KW)
            DataManager.get_instance().delete_data(DataConstants.PROT_ALL_KW)

        except RainetException as re:
            Logger.get_instance().error(re.to_string())
            Timer.get_instance().stop_chrono("ERROR : Data insertion FAILED")
            return

        # # Report on potential missing data
        # self.check_missing_data()

        # Stop the chrono
        Timer.get_instance().stop_chrono("Data insertion finished")