def execute(self):
    """Run the query stored in the query file on the database and export the result.

    The database path, the query file path and the species are read from
    the OptionManager. The species is checked against the database before
    the query is read, performed and exported.

    Raises:
        RainetException: if no species (None or empty) was provided.
    """
    options = OptionManager.get_instance()
    logger = Logger.get_instance()

    # Get the option values
    self.DBPath = options.get_option(OptionConstants.OPTION_DB_NAME)
    query_file_path = options.get_option(OptionConstants.OPTION_QUERY_FILE)
    species = options.get_option(OptionConstants.OPTION_SPECIES)

    # Check if a species has been provided (rejects both None and empty values)
    if not species:
        raise RainetException(
            "InteractiveQueryStrategy.execute: You must specify a species in your command line. Please check help.")

    # Build a SQL session to DB
    SQLManager.get_instance().set_DBpath(self.DBPath)
    sql_session = SQLManager.get_instance().get_session()

    # Check if the species in the DB correspond to the species given by the user
    SQLManager.check_species(sql_session, species, True)

    # Read the query from file
    logger.info("InteractiveQueryStrategy.execute : Reading query...")
    query_string = self.read_query(query_file_path)

    # Get the result of the query on DB
    logger.info("InteractiveQueryStrategy.execute : Querying database...")
    query_result = self.perform_query(sql_session, query_string)

    # Export query result to file (the query file path is handed to the exporter)
    logger.info("InteractiveQueryStrategy.execute : Exporting result...")
    self.export_query_result(query_result, query_file_path)

    logger.info("InteractiveQueryStrategy.execute : Finished.")
def execute(self):
    """Load the database path and the force-override flag from the options, then insert the data."""
    options = OptionManager.get_instance()

    db_path = options.get_option(OptionConstants.OPTION_DB_NAME)
    force_override = options.get_option(
        OptionConstants.OPTION_INSERTION_FORCE_OVERRIDE)

    # Both values are kept on the instance before the insertion starts
    self.DBPath = db_path
    self.forceOverride = force_override

    self.insert_data()
def setUp(self):
    """Prepare options, logging, the SQL session and the strategy used by the tests.

    When running from the command line / main script the OptionManager
    gets the default values, but in unittest every argument must be set
    explicitly, whether optional or not. Individual tests may override
    these values.
    """
    options = OptionManager.get_instance()

    # (option, value) pairs, applied in this exact order
    option_values = (
        (OptionConstants.OPTION_VERBOSITY, "debug"),
        (OptionConstants.OPTION_DB_NAME,
         "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet_testing_DB.sqlite"),
        (OptionConstants.OPTION_SPECIES, "human"),
        (OptionConstants.OPTION_OUTPUT_FOLDER,
         "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_results/enrichmentAnalysis/"),
        (OptionConstants.OPTION_ANNOTATION_TABLE, "NetworkModule"),
        (OptionConstants.OPTION_MINIMUM_PROTEIN_ANNOTATION,
         OptionConstants.DEFAULT_MINIMUM_PROTEIN_ANNOTATION),
        (OptionConstants.OPTION_MINIMUM_PROTEIN_INTERACTION,
         OptionConstants.DEFAULT_MINIMUM_PROTEIN_INTERACTION),
        (OptionConstants.OPTION_NUMBER_RANDOMIZATIONS,
         OptionConstants.DEFAULT_NUMBER_RANDOMIZATIONS),
        (OptionConstants.OPTION_EXPRESSION_WARNING,
         OptionConstants.DEFAULT_EXPRESSION_WARNING),
        (OptionConstants.OPTION_MINIMUM_EXPRESSION,
         OptionConstants.DEFAULT_MINIMUM_EXPRESSION),
        (OptionConstants.OPTION_LOWER_TAIL,
         OptionConstants.DEFAULT_LOWER_TAIL),
    )
    for option_name, option_value in option_values:
        options.set_option(option_name, option_value)

    # Set the level of verbosity
    verbosity = options.get_option(OptionConstants.OPTION_VERBOSITY)
    Logger.get_instance().set_level(verbosity)

    # Setting up SQL manager
    sql_manager = SQLManager.get_instance()
    sql_manager.set_DBpath(options.get_option(OptionConstants.OPTION_DB_NAME))
    self.sql_session = sql_manager.get_session()

    # Setting up internal test folder paths
    self.expectedFolder = "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/enrichmentAnalysis/test_expected"
    self.outputFolder = options.get_option(OptionConstants.OPTION_OUTPUT_FOLDER)

    # Create instance of strategy and hand it a session.
    # NOTE(review): if this class derives from unittest.TestCase, the
    # attribute name `run` shadows TestCase.run on the instance - confirm
    # this is intended.
    strategy = EnrichmentAnalysisStrategy()
    self.run = strategy
    self.run.sql_session = sql_manager.get_session()
def setUp(self):
    """Prepare options, logging, the SQL session and the AnalysisStrategy under test.

    When running from the command line / main script the OptionManager
    gets the default values, but in unittest every argument must be set
    explicitly, whether optional or not. Individual tests may override
    these values.
    """
    options = OptionManager.get_instance()

    # (option, value) pairs, applied in this exact order
    option_values = (
        (OptionConstants.OPTION_VERBOSITY, "debug"),
        (OptionConstants.OPTION_DB_NAME, DB_PATH),
        (OptionConstants.OPTION_SPECIES, "human"),
        (OptionConstants.OPTION_OUTPUT_FOLDER,
         "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_results/"),
        (OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE,
         OptionConstants.DEFAULT_INTERACTION_SCORE),
        (OptionConstants.OPTION_RNA_BIOTYPES,
         OptionConstants.DEFAULT_RNA_BIOTYPES),
        (OptionConstants.OPTION_GENCODE, OptionConstants.DEFAULT_GENCODE),
        (OptionConstants.OPTION_EXPRESSION_VALUE_CUTOFF,
         OptionConstants.DEFAULT_EXPRESSION_VALUE_CUTOFF),
        (OptionConstants.OPTION_EXPRESSION_TISSUE_CUTOFF,
         OptionConstants.DEFAULT_EXPRESSION_TISSUE_CUTOFF),
        (OptionConstants.OPTION_LOW_MEMORY,
         OptionConstants.DEFAULT_LOW_MEMORY),
    )
    for option_name, option_value in option_values:
        options.set_option(option_name, option_value)

    # Set the level of verbosity
    verbosity = options.get_option(OptionConstants.OPTION_VERBOSITY)
    Logger.get_instance().set_level(verbosity)

    # Setting up SQL manager
    sql_manager = SQLManager.get_instance()
    sql_manager.set_DBpath(DB_PATH)
    self.sql_session = sql_manager.get_session()

    # Setting up internal test folder paths
    self.expectedFolder = "/home/diogo/workspace/tagc-rainet-RNA/test/fr/tagc/rainet/core/test_expected/Report"
    output_root = options.get_option(OptionConstants.OPTION_OUTPUT_FOLDER)
    self.outputFolder = output_root + "/Report/"

    # Create instance of strategy
    self.strategy = AnalysisStrategy()

    # Report only written for selected tests
    self.strategy.writeReportFile = 0
def test_RNA_filter_two(self):
    """Filter RNAs on the "lincRNA" biotype and compare with a direct LncRNA table query."""
    print("| test_RNA_filter_two | ")

    optionManager = OptionManager.get_instance()
    optionManager.set_option(OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA")

    self.strategy.execute()

    lincRNAs = DataManager.get_instance().get_data(
        AnalysisStrategy.RNA_FILTER_KW)

    self.assertEqual(
        len(lincRNAs), 9,
        "asserting if number of objects retrieved is correct")

    # Every retrieved object must come from the LncRNA table/class
    for lincRNA in lincRNAs:
        self.assertIsInstance(
            lincRNA, LncRNA,
            "check if the lncRNA is instance of LncRNA table/class")

    # The filter result must agree with querying the table directly
    response = self.sql_session.query(LncRNA).filter(
        LncRNA.transcriptBiotype == "lincRNA").all()

    self.assertEqual(
        len(lincRNAs), len(response),
        "asserting if number of objects in lncRNA default (off) option is same as querying directly lncRNA table"
    )
def test_annotation_report(self):
    """Check protein counts computed by the strategy for NetworkModule and KEGGPathway annotations."""
    print("| test_annotation_report | ")

    self.run.execute()

    # --- For NetworkModule (annotation table set in setUp) ---

    # Background should be overlap between interactingProteins and Proteins with annotation
    # SELECT count(distinct( InteractingProtein.uniprotAC)) FROM InteractingProtein, ProteinNetworkModule
    #   WHERE InteractingProtein.uniprotAC == ProteinNetworkModule.protein_id
    self.assertEqual(
        len(self.run.backgroundProteins), 3,
        "assert number of background proteins is correctly calculated")

    # SELECT count(distinct(ProteinNetworkModule.protein_id)) FROM ProteinNetworkModule -> 42
    self.assertEqual(
        self.run.protAnnotDictLen, 42,
        "assert number of proteins with annotations is correctly calculated"
    )

    self.assertEqual(
        self.run.allProteinsWithInteractionDataLen,
        len(DataManager.get_instance().get_data(
            EnrichmentAnalysisStrategy.PRI_PROT_KW)),
        "assert number of proteins with interactions is correctly calculated"
    )

    # SELECT count(distinct(ProteinNetworkModule.networkModule_id)) FROM ProteinNetworkModule -> 82.
    # But this does not count interactions.
    # NOTE(review): `pool` is computed but not asserted on; the original
    # comparison against len(self.run.backgroundProteins) was disabled.
    # The computation is kept so the test still touches
    # annotWithInteractionDict exactly as before.
    pool = {
        prot
        for annot in self.run.annotWithInteractionDict
        for prot in self.run.annotWithInteractionDict[annot]
    }

    # --- For KEGG Pathway ---

    optionManager = OptionManager.get_instance()
    optionManager.set_option(OptionConstants.OPTION_ANNOTATION_TABLE,
                             "KEGGPathway")

    self.run.execute()

    # SELECT count(distinct(ProteinKEGGAnnotation.protein_id)) FROM ProteinKEGGAnnotation -> 22
    self.assertEqual(
        self.run.protAnnotDictLen, 22,
        "assert number of proteins with annotations is correctly calculated"
    )

    self.assertEqual(
        self.run.allProteinsWithInteractionDataLen,
        len(DataManager.get_instance().get_data(
            EnrichmentAnalysisStrategy.PRI_PROT_KW)),
        "assert number of proteins with interactions is correctly calculated"
    )
def test_RNA_filter_four(self):
    """Run the strategy with the gencode option set to "1" and check the number of RNAs kept."""
    print("| test_RNA_filter_four | ")

    optionManager = OptionManager.get_instance()
    optionManager.set_option(OptionConstants.OPTION_GENCODE, "1")

    self.strategy.execute()

    RNAs = DataManager.get_instance().get_data(
        AnalysisStrategy.RNA_FILTER_KW)

    self.assertEqual(
        len(RNAs), 70,
        "asserting if number of object with gencode is correct")
def test_PRI_filter_one(self):
    """Run the strategy with a minimum interaction score of "10.0" and check the number of interactions kept."""
    print("| test_PRI_filter_one | ")

    optionManager = OptionManager.get_instance()
    optionManager.set_option(
        OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, "10.0")

    self.strategy.execute()

    PRIs = DataManager.get_instance().get_data(
        AnalysisStrategy.PRI_FILTER_KW)

    # Regarding peptide redundancy filter, there are two such cases in the
    # test database: one is misc_RNA (always filtered) and the other is
    # lincRNA, so only one peptide (interaction) is removed.
    self.assertEqual(
        len(PRIs), 9,
        "asserting if number of interactions above certain interaction score is correct"
    )
def test_retrieve_expression(self):
    """Check the expression data structures filled by retrieve_expression()."""
    print("| test_retrieve_expression | ")

    optionManager = OptionManager.get_instance()
    optionManager.set_option(OptionConstants.OPTION_EXPRESSION_WARNING, 0.8)
    optionManager.set_option(OptionConstants.OPTION_MINIMUM_EXPRESSION, 1.0)

    self.run.execute()
    self.run.retrieve_expression()

    rna_id = "ENST00000421534"

    # Protein P62826 must be present, with 49 entries
    self.assertIn("P62826", self.run.protTissueExpressions)
    self.assertEqual(len(self.run.protTissueExpressions["P62826"]), 49)

    # The RNA of interest must have 2 entries
    self.assertEqual(len(self.run.expressionDict[rna_id]), 2)
def build_database(self, path, force_override):
    """Create (or reuse) the database file at `path` and check its species.

    Args:
        path: path of the database file.
        force_override: when true, the existing database file is removed
            and the tables are (re)created.

    Raises:
        RainetException: if no species (None or empty) was given in the
            options.
    """
    # Get the value of database species specified by the user
    species = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_SPECIES)
    if not species:
        raise RainetException(
            "SQLManager.build_database: You must specify a species in your command line. Please check help."
        )

    # Remove the database file if required
    if force_override:
        self.remove_database_file(path)

    # Create engine to dedicated database
    engine = self.create_engine(path)

    # Open the DB session
    session = sessionmaker()
    session.configure(bind=engine)

    # Look if the Protein table exists. If not, it means DB model was not created.
    # Only one row is requested: the probe needs to know whether the query
    # can run, not what the table contains.
    model_exists = True
    probe_session = session()
    try:
        probe_session.query(DataConstants.PROTEIN_CLASS).first()
    except Exception:
        # Deliberate best-effort kept from the original: any failure of the
        # probe query is taken to mean that the model is missing.
        model_exists = False
    finally:
        # The probe session was previously never closed
        probe_session.close()

    # Create all the required table in DB according to class model
    # if required (override mode or model does not exist in DB)
    if force_override or not model_exists:
        Base.metadata.create_all(engine)

    # NOTE(review): the nesting of this species check was reconstructed from
    # a flattened source; it is run unconditionally here - confirm it was
    # not meant to run only when the tables are created.
    sql_session = session()
    try:
        SQLManager.check_species(sql_session, species, True)
    finally:
        # Close the session even when the species check raises
        sql_session.close()

    # Keep the DB path
    self.DBPath = path

    Logger.get_instance().info('Database File created/used : ' + str(path))
def test_RNA_filter_one(self):
    """Filter RNAs on the "protein_coding" biotype and check that only MRNA objects are retrieved."""
    print("| test_RNA_filter_one | ")

    optionManager = OptionManager.get_instance()
    optionManager.set_option(OptionConstants.OPTION_RNA_BIOTYPES,
                             "protein_coding")

    self.strategy.execute()

    mRNAs = DataManager.get_instance().get_data(
        AnalysisStrategy.RNA_FILTER_KW)

    self.assertEqual(
        len(mRNAs), 82,
        "asserting if number of objects retrieved is correct")

    # Every retrieved object must come from the MRNA table/class
    for mRNA in mRNAs:
        self.assertIsInstance(
            mRNA, MRNA,
            "check if the mRNA is instance of MRNA table/class")
def execute(self):
    """Build the strategy named by the OptionManager and execute it.

    The strategy keyword is suffixed with "Strategy" and instantiated.
    A RainetException raised while the strategy runs is logged as an
    error and not propagated.

    Raises:
        RainetException: if no strategy keyword is defined, or if no
            strategy object could be built from the keyword.
    """
    strategy_command = OptionManager.get_instance().get_strategy()

    if strategy_command is None:
        raise RainetException(
            "Rainet.execute : No strategy was defined: aborting")

    try:
        # SECURITY NOTE(review): eval() executes whatever text get_strategy()
        # returns. If that keyword can come from an untrusted source, replace
        # this with a lookup in an explicit keyword -> class table.
        strategy = eval(strategy_command + "Strategy()")
    except Exception:
        # Any failure (unknown name, constructor error, ...) is reported as
        # an unknown keyword, as in the original implementation.
        raise RainetException(
            "Rainet.execute : No strategy associated to keyword " +
            str(strategy_command))

    try:
        strategy.execute()
    except RainetException as raie:
        Logger.get_instance().error(
            "Rainet.execute: An exception occurred executing the command:\n"
            + raie.to_string())
def test_PRI_filter_two(self):
    """Check that the retained interactions are affected by the RNA-level filters."""
    print("| test_PRI_filter_two | ")

    # Important to create new SQLManager session if changing database
    SQLManager.get_instance().close_session()

    optionManager = OptionManager.get_instance()
    optionManager.set_option(
        OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, "28")
    optionManager.set_option(OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA")
    optionManager.set_option(OptionConstants.OPTION_GENCODE, 1)

    self.strategy.execute()

    PRIs = DataManager.get_instance().get_data(
        AnalysisStrategy.PRI_FILTER_KW)

    # Diagnostic output kept from the original test
    print(len(PRIs))

    self.assertEqual(
        len(PRIs), AnalysisStrategyUnittest.TOTAL_PRIS_LINC_FILT,
        "asserting if PRIs are affected by RNA-level filters")
"Rainet.execute: An exception occurred executing the command:\n" + raie.to_string()) #=============================================================================== # The main function #=============================================================================== if __name__ == '__main__': try: # Create Logger instance by using the first log action. Logger.get_instance().info("Rainet : Starting...") # Store the options OptionManager.get_instance().initialize() # Set the level of verbosity Logger.get_instance().set_level( OptionManager.get_instance().get_option( OptionConstants.OPTION_VERBOSITY)) # Instantiate Rainet with the correct database path rainet = Rainet() # Insert the data to database rainet.execute() except RainetException as rainet: Logger.get_instance().error( "Error during execution of Rainet. Aborting :\n" +
def test_expression_warning(self):
    """Check the value of the sixth tab-separated field of two lines of the enrichment report."""
    print("| test_expression_warning | ")

    options = OptionManager.get_instance()

    overrides = (
        # need larger database for this test
        (OptionConstants.OPTION_DB_NAME,
         "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite"),
        # set parameters for expression test
        (OptionConstants.OPTION_EXPRESSION_WARNING, 0.8),
        (OptionConstants.OPTION_MINIMUM_EXPRESSION, 1.0),
        # so that only way to have skip test flag is by expression filter
        (OptionConstants.OPTION_MINIMUM_PROTEIN_ANNOTATION, 0),
        (OptionConstants.OPTION_MINIMUM_PROTEIN_INTERACTION, 0),
        (OptionConstants.OPTION_NUMBER_RANDOMIZATIONS, 1),
    )
    for option_name, option_value in overrides:
        options.set_option(option_name, option_value)

    # important to create new SQLManager session if changing database
    SQLManager.get_instance().close_session()

    self.run.execute()

    # NOTE(review): both counters are maintained but never asserted on
    line_count = 0
    tests_per_rna = 0

    report_path = (self.outputFolder + "/" +
                   EnrichmentAnalysisStrategy.REPORT_ENRICHMENT)

    with open(report_path) as report:
        # Discard the first line (presumably the header - confirm)
        report.readline()

        for record in report:
            line_count += 1

            if record.startswith("ENST00000309775\t344"):
                fields = record.split("\t")
                # protein-mRNA correspondence "ENST00000321265","Q9Y266"
                self.assertTrue(
                    fields[5] == "0",
                    "example where protein is expressed in all tissues, therefore it passes the filter"
                )

            if record.startswith("ENST00000309775\t351"):
                fields = record.split("\t")
                self.assertTrue(
                    fields[5] == "1",
                    "example where the two proteins are not expressed in common tissues, therefore it does not pass the filter"
                )
                # expression of RNA
                # set(['Uterus', 'Brain - Cerebellum', 'Cells - EBV-transformed lymphocytes', 'Brain - Cerebellar Hemisphere'])
                # {'Uterus': 0, 'Brain - Cerebellar Hemisphere': 0, 'Cells - EBV-transformed lymphocytes': 0, 'Brain - Cerebellum': 0}
                # interacting/annotated proteins
                # ['P51843', 'Q14994']
                # expression of one of the proteins
                # "ENST00000378970","Adrenal Gland","31.103"
                # "ENST00000378970","Testis","20.62"
                # "ENST00000378970","Ovary","2.754"
                # "ENST00000378970","Pituitary","2.505"
                # "ENST00000378970","Brain - Hypothalamus","1.109"
                # "ENST00000378970","Brain - Amygdala","1.021"
                # there is no overlap of tissues.
                # Note: I did not check other protein, it has too many mRNAs associated to check manually.

                # NOTE(review): placement of this break (inside the second
                # `if`) was reconstructed from a flattened source - confirm.
                break

    # important to create new SQLManager session if changing database
    SQLManager.get_instance().close_session()
def test_PRI_filter_three(self):
    """Run the strategy step by step on the larger database and check the expression-based interaction filter."""
    print("| test_PRI_filter_three | ")

    # Overwrite default values
    options = OptionManager.get_instance()
    overrides = (
        (OptionConstants.OPTION_DB_NAME,
         "/home/diogo/Documents/RAINET_data/TAGC/rainetDatabase/db_testing/rainet2016-06-17.human_expression_wPRI.sqlite"),
        (OptionConstants.OPTION_MINIMUM_INTERACTION_SCORE, 100),
        (OptionConstants.OPTION_RNA_BIOTYPES, "lincRNA"),
        (OptionConstants.OPTION_GENCODE, 1),
        (OptionConstants.OPTION_EXPRESSION_VALUE_CUTOFF, 1.0),
        (OptionConstants.OPTION_EXPRESSION_TISSUE_CUTOFF, 1.0),
    )
    for option_name, option_value in overrides:
        options.set_option(option_name, option_value)

    # important to create new SQLManager session if changing database
    SQLManager.get_instance().close_session()

    # Run strategy step by step
    self.strategy = AnalysisStrategy()
    strategy = self.strategy
    strategy.execute()
    strategy.execute(run=0)
    strategy.filter_RNA()
    strategy.filter_protein()
    strategy.filter_PRI()

    selectedInteractions = DataManager.get_instance().get_data(
        AnalysisStrategy.PRI_FILTER_KW)

    initial_count = len(selectedInteractions)
    self.assertTrue(
        initial_count == 4298,
        "assert if number of initial interactions is correct")

    # run main function we want to test
    strategy.dump_filter_PRI_expression()

    # select count(distinct(proteinID)) from MRNA --> 57366
    self.assertTrue(
        len(strategy.mRNADict) == 57366,
        "assert if number of mRNA to protein correspondence is correct")

    # ENST00000005180 --> Q9Y258, ENST00000394905 --> Q9Y258 (confirmed in Ensembl82 website)
    expected_transcripts = {"ENST00000005180", "ENST00000394905"}
    self.assertTrue(
        set(strategy.mRNADict["Q9Y258"]) == expected_transcripts,
        "assert if a specific mRNA-protein correspondence is correct")

    # select count(distinct(transcriptID)) from RNATIssueExpression --> 184278
    self.assertTrue(
        len(strategy.expressionDict) == 184278,
        "assert number of transcripts with expression data is correct")

    # One expression value is expected per tissue name stored in the database
    tissue_rows = self.sql_session.query(Tissue.tissueName).all()
    tissues = [str(row[0]) for row in tissue_rows]

    self.assertTrue(
        len(strategy.expressionDict["ENST00000394905"]) == len(tissues),
        "assert number of expression values match number of tissues ")

    # grep ENST00000394905 transcript_expression_metrics_no_outliers.tsv
    # ENST00000394905 Skin - Sun Exposed (Lower leg) 0.124 0.199 0.000 1.60395658469 0.754
    wanted_entry = (0.124, "Skin - Sun Exposed (Lower leg)")
    entry_found = any(
        tissTuple == wanted_entry
        for tissTuple in strategy.expressionDict["ENST00000394905"])

    self.assertTrue(
        entry_found,
        "assert specific transcript expression data is correct")

    ## Really test expression filter

    # check protein expression of a specific protein
    # "ENST00000294652","Q5VWK0"
    # "ENST00000370040","Q5VWK0"
    # "ENST00000444143","Q5VWK0"
    # "ENST00000495380","Q5VWK0"
    self.assertTrue(len(strategy.ProtMRNATissueExpressions["Q5VWK0"]) == 4)
    self.assertTrue(
        strategy.ProtMRNATissueExpressions["Q5VWK0"]["ENST00000370040"]
        ["Liver"] == 0)
    self.assertTrue(
        strategy.ProtMRNATissueExpressions["Q5VWK0"]["ENST00000495380"]
        ["Testis"] == 3.363)

    # ENST00000495380, an MRNA of Q5VWK0 protein, has expression value > 1.0 RPKM in testis, given by one of the mRNAs
    # This protein is not present in any other tissue
    # "ENST00000294652","Pituitary","0.012"
    # "ENST00000444143","Testis","0.488"
    # "ENST00000495380","Testis","3.363"

    # InteractingRNA we want to test as positive interaction: ENST00000413466
    # "ENST00000413466","Testis","1.006"
    # InteractingRNA we want to test as negative interaction: ENST00000423943
    # "ENST00000423943","Testis","0.512"

    # key -> tissue, value -> set of protein IDs
    proteinExpressionTissues = DataManager.get_instance().get_data(
        AnalysisStrategy.PROT_TISSUES_KW)
    # key -> tissue, value -> set of tx IDs
    rnaExpressionTissues = DataManager.get_instance().get_data(
        AnalysisStrategy.RNA_TISSUES_KW)
    # key -> transcriptID|proteinID (pair), value -> set of tissues
    expressedInteractionsTissues = DataManager.get_instance().get_data(
        AnalysisStrategy.PRI_TISSUES_KW)

    positive_pair = "ENST00000413466|Q5VWK0"
    negative_pair = "ENST00000423943|Q5VWK0"

    self.assertTrue("Q5VWK0" in proteinExpressionTissues["Testis"])
    self.assertTrue("Q5VWK0" not in proteinExpressionTissues["Pancreas"])
    self.assertTrue("ENST00000413466" in rnaExpressionTissues["Testis"])

    self.assertTrue(
        positive_pair in expressedInteractionsTissues,
        "assert if interaction passes cutoffs")
    self.assertTrue(
        len(expressedInteractionsTissues[positive_pair]) == 1,
        "assert if number of tissues passing cutoff is correct")

    self.assertTrue(
        "ENST00000423943" not in rnaExpressionTissues["Testis"])
    self.assertTrue(
        negative_pair not in expressedInteractionsTissues,
        "assert that interaction is not present")

    newSelectedInteractions = DataManager.get_instance().get_data(
        AnalysisStrategy.PRI_FILTER_KW)

    self.assertTrue(
        len(newSelectedInteractions) <= len(selectedInteractions),
        "expression filtered interactions should be equal or less than initial ones"
    )
def insert_data(self): # Create Logger instance by using the first log action. Logger.get_instance().info( "InsertionStrategy.insert_data: Starting...") # # Backup the database file # try: # Logger.get_instance().info( "InsertionStrategy.insert_data: Backuping DB file..." ) # shutil.copyfile(self.DBPath, self.DBPath + ".back") # except IOError as ioe: # Logger.get_instance().info( " warning : Unable to backup database file : " + self.DBPath + " : " + str( ioe)) # Create database sqlite file at the provided path SQLManager.get_instance().build_database(self.DBPath, self.forceOverride) self.check_database_tables() # Retrieve the inertion properties PropertyManager.get_instance().read_properties( OptionManager.get_instance().get_option( OptionConstants.OPTION_INSERTION_PROPERTIES_PATH, True)) # Start chrono Timer.get_instance().start_chrono() # Indicate insertion mode if self.forceOverride: Logger.get_instance().info(" -- MODE FORCE OVERRIDE -- ") else: Logger.get_instance().info(" -- MODE RESUME -- ") #======================================================================= # INSERTION OF DATA #======================================================================= try: #=================================================================== # PROTEIN DEFINITION #=================================================================== # Parse the protein file input_file = PropertyManager.get_instance().get_property( DataConstants.PROTEIN_UNIPROT_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, True, DataConstants.PROTEIN_HEADERS, DataConstants.PROTEIN_CLASS, DataConstants.PROTEIN_PARAMS, None, DataConstants.PROTEIN_COMMENT_CHAR) # Parse the protein cross references file input_file = PropertyManager.get_instance().get_property( DataConstants.PROTEIN_CROSSREFERENCES_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.PROTEIN_CROSS_REFERENCE_HEADERS, DataConstants.PROTEIN_CROSS_REFERENCE_CLASS, DataConstants.PROTEIN_CROSS_REFERENCE_PARAMS, 
None, DataConstants.PROTEIN_CROSS_REFERENCE_COMMENT_CHAR) # Parse the protein isoform file input_file = PropertyManager.get_instance().get_property( DataConstants.PROTEIN_ISOFORMS_PROPERTY, True) self.launch_insertion_Fasta( input_file, DataConstants.ISOFORM_CLASS, DataConstants.ISOFORM_REGULAR_EXPRESSION, DataConstants.ISOFORM_GROUPS, DataConstants.ISOFORM_PARAMS, DataConstants.ISOFORM_PARAMS_VALUE_ALTERNATIVE, DataConstants.ISOFORM_COMMENT_CHAR) # Parse the protein domain file of SMART DB input_file = PropertyManager.get_instance().get_property( DataConstants.PROTEIN_DOMAIN_SMART_PROPERTY, True) self.launch_insertion_TSV( input_file, True, DataConstants.PROTEIN_DOMAIN_HEADERS_SMART, DataConstants.PROTEIN_DOMAIN_CLASS, DataConstants.PROTEIN_DOMAIN_PARAM_SMART, DataConstants.PROTEIN_DOMAIN_VALUE_SMART, DataConstants.PROTEIN_DOMAIN_COMMENT_CHAR, "SMART", False) # Parse the protein domain file of PFAM DB input_file = PropertyManager.get_instance().get_property( DataConstants.PROTEIN_DOMAIN_PFAM_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.PROTEIN_DOMAIN_HEADERS_PFAM, DataConstants.PROTEIN_DOMAIN_CLASS, DataConstants.PROTEIN_DOMAIN_PARAM_PFAM, DataConstants.PROTEIN_DOMAIN_VALUE_PFAM, DataConstants.PROTEIN_DOMAIN_COMMENT_CHAR, "PFAM", False) #=================================================================== # FUNCTION AND PATHWAY ANNOTATIONS #=================================================================== # Parse the Gene Ontology file input_file = PropertyManager.get_instance().get_property( DataConstants.GENE_ONTOLOGY_DEFINITION_PROPERTY, True) self.launch_insertion_Obo( input_file, DataConstants.GENE_ONTOLOGY_CLASS, DataConstants.GENE_ONTOLOGY_ID_TAG, DataConstants.GENE_ONTOLOGY_NAME_TAG, DataConstants.GENE_ONTOLOGY_NAMESPACE_TAG) # Parse the Protein Gene Ontology annotation file input_file = PropertyManager.get_instance().get_property( DataConstants.GENE_ONTOLOGY_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, 
False, DataConstants.PROTEIN_GO_ANNOTATION_HEADERS, DataConstants.PROTEIN_GO_ANNOTATION_CLASS, DataConstants.PROTEIN_GO_ANNOTATION_PARAMS, None, DataConstants.PROTEIN_GO_ANNOTATION_COMMENT_CHAR) # Parse the KEGG pathway file input_file = PropertyManager.get_instance().get_property( DataConstants.KEGG_PATHWAY_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, True, DataConstants.KEGG_PATHWAY_HEADERS, DataConstants.KEGG_PATHWAY_CLASS, DataConstants.KEGG_PATHWAY_PARAMS, None, DataConstants.KEGG_PATHWAY_COMMENT_CHAR) # Parse the Protein KEGG Pathway annotation file input_file = PropertyManager.get_instance().get_property( DataConstants.KEGG_PATHWAY_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, True, DataConstants.KEGG_PATHWAY_ANNOTATION_HEADERS, DataConstants.KEGG_PATHWAY_ANNOTATION_CLASS, DataConstants.KEGG_PATHWAY_ANNOTATION_PARAMS, None, DataConstants.KEGG_PATHWAY_ANNOTATION_COMMENT_CHAR) #=================================================================== # REACTOME #=================================================================== # Parse the Reactome pathway file input_file = PropertyManager.get_instance().get_property( DataConstants.REACTOME_PATHWAY_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.REACTOME_PATHWAY_HEADERS, DataConstants.REACTOME_PATHWAY_CLASS, DataConstants.REACTOME_PATHWAY_PARAMS, None, DataConstants.REACTOME_PATHWAY_COMMENT_CHAR) # Parse the Protein Reactome Pathway annotation file input_file = PropertyManager.get_instance().get_property( DataConstants.REACTOME_PATHWAY_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.REACTOME_PATHWAY_ANNOTATION_HEADERS, DataConstants.REACTOME_PATHWAY_ANNOTATION_CLASS, DataConstants.REACTOME_PATHWAY_ANNOTATION_PARAMS, None, DataConstants.REACTOME_PATHWAY_ANNOTATION_COMMENT_CHAR) #=================================================================== # BIOPLEX 
#=================================================================== # Parse the file listing Bioplex clusters input_file = PropertyManager.get_instance().get_property( DataConstants.BIOPLEX_CLUSTER_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.BIOPLEX_CLUSTER_HEADERS, DataConstants.BIOPLEX_CLUSTER_CLASS, DataConstants.BIOPLEX_CLUSTER_PARAMS, None, DataConstants.BIOPLEX_CLUSTER_COMMENT_CHAR) # Parse the file with Bioplex annotations input_file = PropertyManager.get_instance().get_property( DataConstants.BIOPLEX_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.BIOPLEX_ANNOTATION_HEADERS, DataConstants.BIOPLEX_ANNOTATION_CLASS, DataConstants.BIOPLEX_ANNOTATION_PARAMS, None, DataConstants.BIOPLEX_ANNOTATION_COMMENT_CHAR) #=================================================================== # WAN CLUSTERS #=================================================================== # Parse the file listing Wan clusters input_file = PropertyManager.get_instance().get_property( DataConstants.WAN_CLUSTER_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, False, DataConstants.WAN_CLUSTER_HEADERS, DataConstants.WAN_CLUSTER_CLASS, DataConstants.WAN_CLUSTER_PARAMS, None, DataConstants.WAN_CLUSTER_COMMENT_CHAR) # Parse the file with Wan annotations input_file = PropertyManager.get_instance().get_property( DataConstants.WAN_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.WAN_ANNOTATION_HEADERS, DataConstants.WAN_ANNOTATION_CLASS, DataConstants.WAN_ANNOTATION_PARAMS, None, DataConstants.WAN_ANNOTATION_COMMENT_CHAR) #=================================================================== # CORUM CLUSTERS #=================================================================== # Parse the file listing Corum clusters input_file = PropertyManager.get_instance().get_property( DataConstants.CORUM_CLUSTER_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, 
False, DataConstants.CORUM_CLUSTER_HEADERS, DataConstants.CORUM_CLUSTER_CLASS, DataConstants.CORUM_CLUSTER_PARAMS, None, DataConstants.CORUM_CLUSTER_COMMENT_CHAR) # Parse the file with Corum annotations input_file = PropertyManager.get_instance().get_property( DataConstants.CORUM_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.CORUM_ANNOTATION_HEADERS, DataConstants.CORUM_ANNOTATION_CLASS, DataConstants.CORUM_ANNOTATION_PARAMS, None, DataConstants.CORUM_ANNOTATION_COMMENT_CHAR) #=================================================================== # CUSTOM CLUSTERS #=================================================================== # Parse the file listing Custom clusters input_file = PropertyManager.get_instance().get_property( DataConstants.CUSTOM_CLUSTER_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.CUSTOM_CLUSTER_HEADERS, DataConstants.CUSTOM_CLUSTER_CLASS, DataConstants.CUSTOM_CLUSTER_PARAMS, None, DataConstants.CUSTOM_CLUSTER_COMMENT_CHAR) # Parse the file with Custom annotations input_file = PropertyManager.get_instance().get_property( DataConstants.CUSTOM_ANNOTATION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.CUSTOM_ANNOTATION_HEADERS, DataConstants.CUSTOM_ANNOTATION_CLASS, DataConstants.CUSTOM_ANNOTATION_PARAMS, None, DataConstants.CUSTOM_ANNOTATION_COMMENT_CHAR) #=================================================================== # INTERACTOME #=================================================================== # Parse the protein interaction file input_file = PropertyManager.get_instance().get_property( DataConstants.INTERACTOME_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, False, DataConstants.INTERACTOME_HEADER, DataConstants.INTERACTOME_CLASS, DataConstants.INTERACTOME_PARAMS, None, DataConstants.INTERACTOME_COMMENT_CHAR) # Parse the protein interaction network file input_file = PropertyManager.get_instance().get_property( 
DataConstants.INTERACTOME_NETWORK_DEFINITION_PROPERTY, True) ppi_default_values = [None, None, os.path.basename(input_file)] self.launch_insertion_TSV( input_file, False, DataConstants.INTERACTOME_NETWORK_HEADER, DataConstants.INTERACTOME_NETWORK_CLASS, DataConstants.INTERACTOME_NETWORK_PARAMS, ppi_default_values, DataConstants.INTERACTOME_NETWORK_COMMENT_CHAR) # Parse the Network Module file input_file = PropertyManager.get_instance().get_property( DataConstants. INTERACTOME_NETWORK_PARTITION_DEFINITION_PROPERTY, True) self.launch_insertion_NetworkModule( input_file, DataConstants.INTERACTOME_NETWORK_PARTITION_CLASS, DataConstants.INTERACTOME_NETWORK_PARTITION_CLASS_TAG, DataConstants.INTERACTOME_NETWORK_PARTITION_COMMENT_CHAR) # Parse the Network Module Annotation file input_file = PropertyManager.get_instance().get_property( DataConstants. INTERACTOME_NETWORK_PARTITION_ANNOTATION_PROPERTY, True) self.launch_insertion_NetworkModuleAnnotation( input_file, DataConstants.INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS, DataConstants. INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS_TAG, DataConstants. INTERACTOME_NETWORK_PARTITION_ANNOTATION_CLASS_REGEX, DataConstants. INTERACTOME_NETWORK_PARTITION_ANNOTATION_PROTEIN_TAG, DataConstants. INTERACTOME_NETWORK_PARTITION_ANNOTATION_ANNOTATION_TAG, DataConstants.INTERACTOME_NETWORK_PARTITION_COMMENT_CHAR) # Parse the protein redundancy file input_file = PropertyManager.get_instance().get_property( DataConstants. INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_PROPERTY, True) interactome_network_redundancy_definition_value = [ None, basename(input_file), None ] self.launch_insertion_TSV( input_file, False, DataConstants. INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_HEADERS, DataConstants.INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_CLASS, DataConstants.INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_PARAMS, interactome_network_redundancy_definition_value, DataConstants. 
INTERACTOME_NETWORK_REDUNDANCY_DEFINITION_COMMENT_CHAR, "Redundancy", False) #=================================================================== # RNA DEFINITION #=================================================================== # Make query of specific type of protein cross references to speed up insertion DataManager.get_instance().perform_query( DataConstants.PROTEIN_ENSP_XREF_KW, "query( ProteinCrossReference.protein_id,ProteinCrossReference.crossReferenceID ).filter(ProteinCrossReference.sourceDB == DataConstants.PROTEIN_ENSP_XREF_DB).all()" ) # Convert query into a dictionary DataManager.get_instance().query_to_dict( DataConstants.PROTEIN_ENSP_XREF_KW, 1, 0) # Parse the RNA file input_file = PropertyManager.get_instance().get_property( DataConstants.RNA_DEFINITION_PROPERTY, True) self.launch_insertion_TSV(input_file, True, DataConstants.RNA_HEADERS, DataConstants.RNA_CLASS, DataConstants.RNA_PARAMS, None, DataConstants.RNA_COMMENT_CHAR) # Parse the RNA cross references file input_file = PropertyManager.get_instance().get_property( DataConstants.RNA_CROSS_REFERENCE_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.RNA_CROSS_REFERENCE_HEADERS, DataConstants.RNA_CROSS_REFERENCE_CLASS, DataConstants.RNA_CROSS_REFERENCE_PARAMS, None, DataConstants.RNA_CROSS_REFERENCE_COMMENT_CHAR) #=================================================================== # RNA TISSUE EXPRESSION #=================================================================== # Make query of all RNA IDs to speed up insertion DataManager.get_instance().perform_query(DataConstants.RNA_ALL_KW, "query( RNA ).all()") # Format query into dict data structure DataManager.get_instance().query_to_object_dict( DataConstants.RNA_ALL_KW, "transcriptID") # Make query of all Protein IDs (uniprotAC) to speed up insertion DataManager.get_instance().perform_query(DataConstants.PROT_ALL_KW, "query( Protein ).all()") # Format query into dict data structure 
DataManager.get_instance().query_to_object_dict( DataConstants.PROT_ALL_KW, "uniprotAC") # Parse the RNA tissue expression file input_file = PropertyManager.get_instance().get_property( DataConstants.RNA_TISSUE_EXPRESSION_PROPERTY, True) self.launch_insertion_TSV( input_file, True, DataConstants.RNA_TISSUE_EXPRESSION_HEADERS, DataConstants.RNA_TISSUE_EXPRESSION_CLASS, DataConstants.RNA_TISSUE_EXPRESSION_PARAMS, DataConstants.RNA_TISSUE_EXPRESSION_VALUE, DataConstants.RNA_TISSUE_EXPRESSION_COMMENT_CHAR) #=================================================================== # PROTEIN RNA INTERACTION #=================================================================== self.forceOverride = 1 # Parse the file listing RNA with catRAPID data input_file = PropertyManager.get_instance().get_property( DataConstants.INTERACTING_RNA_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, True, DataConstants.INTERACTING_RNA_DEFINITION_HEADERS, DataConstants.INTERACTING_RNA_DEFINITION_CLASS, DataConstants.INTERACTING_RNA_DEFINITION_PARAMS, None, DataConstants.INTERACTING_RNA_DEFINITION_COMMENT_CHAR) # Parse the file listing Proteins with catRAPID data input_file = PropertyManager.get_instance().get_property( DataConstants.INTERACTING_PROTEIN_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, True, DataConstants.INTERACTING_PROTEIN_DEFINITION_HEADERS, DataConstants.INTERACTING_PROTEIN_DEFINITION_CLASS, DataConstants.INTERACTING_PROTEIN_DEFINITION_PARAMS, None, DataConstants.INTERACTING_PROTEIN_DEFINITION_COMMENT_CHAR) # Initialize data items to store missing interactions if DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_RNA_KW not in DataManager.get_instance( ).data: DataManager.get_instance().store_data( DataConstants. PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_RNA_KW, []) if DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_PROT_KW not in DataManager.get_instance( ).data: DataManager.get_instance().store_data( DataConstants. 
PROTEIN_RNA_INTERACTION_CATRAPID_MISSING_PROT_KW, []) # Parse the ProteinRNAInteractionCatRAPID file input_file = PropertyManager.get_instance().get_property( DataConstants. PROTEIN_RNA_INTERACTION_CATRAPID_DEFINITION_PROPERTY, True) self.launch_insertion_TSV( input_file, False, DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_HEADERS, DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_CLASS, DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_PARAMS, None, DataConstants.PROTEIN_RNA_INTERACTION_CATRAPID_COMMENT_CHAR) self.forceOverride = 0 # Remove data that will no longer be used to reduce memory usage DataManager.get_instance().delete_data( DataConstants.PROTEIN_ENSP_XREF_KW) DataManager.get_instance().delete_data(DataConstants.RNA_ALL_KW) DataManager.get_instance().delete_data(DataConstants.PROT_ALL_KW) except RainetException as re: Logger.get_instance().error(re.to_string()) Timer.get_instance().stop_chrono("ERROR : Data insertion FAILED") return # # Report on potential missing data # self.check_missing_data() # Stop the chrono Timer.get_instance().stop_chrono("Data insertion finished")