def merge_dsorftranscriptasso(self):
    Logger.get_instance().info('Starting to merge the entries of the DSORFTranscriptAsso table.')
    # Get all the existing (ORF, Transcript) couples for which it is
    # necessary to merge all the related DSORFTranscriptAsso entries.
    # If they have not yet been computed, then do it
    if (self.resume_at_stage in [ResumeMergeStrategy.RESUME_AFTER_CONSERVED,
                                 ResumeMergeStrategy.RESUME_AFTER_ORF,
                                 ResumeMergeStrategy.RESUME_AFTER_TRANSCRIPT]):
        self.get_dsorftranscriptasso_to_merge()
    # Otherwise import them from the .dcorf file
    elif (self.resume_at_stage in [ResumeMergeStrategy.RESUME_AFTER_OTA_ID_ASSO,
                                   ResumeMergeStrategy.RESUME_DURING_OTA]):
        self.import_dsorftranscriptasso_to_merge()
    # Any other case that is not handled should raise a programming exception
    else:
        raise DenCellORFException('ResumeMergeStrategy.merge_dsorftranscriptasso(): The case ' +
                                  self.resume_at_stage + ' is not properly handled in the method!' +
                                  ' Please contact the developer if you see this message.')
    # Merge the entries of the DSORFTranscriptAsso table
    self.merge_dsota()
def execute(self):
    # For each database model, if the strategy selected by the user requires
    # checking / building the database, instantiate the appropriate
    # CheckDatabase class and execute the CheckDatabase strategy.
    for db_model in self.DATABASE_MODELS:
        strategies_checking_db = eval('OptionConstants.STRATEGIES_CHECKING_' + db_model + '_DATABASE')
        if (self.called_strategy in strategies_checking_db):
            Logger.get_instance().info('Checking the ' + db_model + ' database...')
            try:
                check_db = eval('Check' + db_model + 'Database()')
            except Exception as e:
                raise DenCellORFException('DatabaseCheckStrategy: An exception occurred trying to' +
                                          ' instantiate Check' + db_model + 'Database.', e)
            try:
                check_db.execute()
            except DenCellORFException as e:
                raise DenCellORFException('DatabaseCheckStrategy: An exception occurred trying to' +
                                          ' check the ' + db_model + ' database.', e)
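# --------------------------------------------------------------------------
# Design note (sketch, not part of the original module): the eval()-based
# dispatch above resolves 'Check' + db_model + 'Database' dynamically from
# the model name. An equivalent, eval-free pattern uses an explicit
# registry; the class names below are assumptions inferred from the code
# above ('DS' and 'PRO' being the models managed elsewhere in this module).
#
#   CHECK_DB_CLASSES = { 'DS': CheckDSDatabase,
#                        'PRO': CheckPRODatabase }
#   check_db = CHECK_DB_CLASSES[ db_model ]()
#   check_db.execute()
# --------------------------------------------------------------------------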
def get_obj_from_file( input_folder, filename ):
    file_path = os.path.join( input_folder, filename ) + Constants.DENCELLORF_FILES_EXTENSION
    # Check the file exists
    if not os.path.exists( file_path ):
        raise DenCellORFException( 'FileHandlerUtil.get_obj_from_file(): No file has been found' +
                                   ' at the path provided (' + file_path + ').' +
                                   ' Please note that some strategies may only be run in certain' +
                                   ' particular cases (e.g. the Restore strategy may only be run' +
                                   ' after a successful run of the Backup strategy).' +
                                   ' Please see the documentation for more information.' )
    # Get the list of objects from the file
    Logger.get_instance().debug( 'FileHandlerUtil.get_obj_from_file(): Importing data from ' +
                                 file_path + '...' )
    try:
        with open( file_path, 'rb' ) as saved_objects_file:
            list_of_obj_unpickler = pickle.Unpickler( saved_objects_file )
            list_of_objects = list_of_obj_unpickler.load()
    except Exception as e:
        raise DenCellORFException( 'FileHandlerUtil.get_obj_from_file(): An error occurred trying' +
                                   ' to get the objects saved in ' + file_path +
                                   '. Hence, the data from this file will not be loaded.' +
                                   ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
    else:
        Logger.get_instance().debug( 'FileHandlerUtil.get_obj_from_file(): ' +
                                     str( len( list_of_objects ) ) +
                                     ' objects have been successfully loaded from ' +
                                     file_path + '.' )
    return list_of_objects
def save_obj_to_file( objects_to_save, filename, output_folder=DefaultOutputFolder.OUTPUT_FOLDER ):
    # Create the output folder if it does not yet exist
    # (and its parent folders if necessary)
    if not os.path.isdir( output_folder ):
        os.makedirs( output_folder )
    file_path = os.path.join( output_folder, filename ) + Constants.DENCELLORF_FILES_EXTENSION
    Logger.get_instance().debug( 'FileHandlerUtil.save_obj_to_file(): ' +
                                 str( len( objects_to_save ) ) + ' objects will be saved in ' +
                                 file_path + '.' )
    # Save the objects in the file
    try:
        with open( file_path, 'wb' ) as objects_to_save_file:
            obj_to_insert_pickler = pickle.Pickler( objects_to_save_file )
            obj_to_insert_pickler.dump( objects_to_save )
    except Exception as e:
        raise DenCellORFException( 'FileHandlerUtil.save_obj_to_file(): An error occurred trying' +
                                   ' to save the objects in ' + file_path +
                                   '. Hence, these data will not be saved.' +
                                   ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
    else:
        Logger.get_instance().debug( 'FileHandlerUtil.save_obj_to_file(): ' +
                                     str( len( objects_to_save ) ) +
                                     ' objects have been successfully saved in ' + file_path +
                                     '. This file may be used to recover data later.' +
                                     ' Please see the documentation for more information.' )
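# --------------------------------------------------------------------------
# Usage sketch (hypothetical, not part of the original module): a pickle
# round-trip through save_obj_to_file() / get_obj_from_file(). The folder
# path and the record list are made up for illustration; the file extension
# is appended automatically from Constants.DENCELLORF_FILES_EXTENSION.
#
#   records = [ ( 'ORF_1', 125 ), ( 'ORF_2', 88 ) ]
#   FileHandlerUtil.save_obj_to_file( objects_to_save = records,
#                                     filename = 'demo_records',
#                                     output_folder = '/tmp/dencellorf_demo' )
#   restored = FileHandlerUtil.get_obj_from_file( input_folder = '/tmp/dencellorf_demo',
#                                                 filename = 'demo_records' )
#   assert restored == records
# --------------------------------------------------------------------------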
def execute(self):
    # Create a session to the DS database
    SQLManagerDS.get_instance().set_db_settings(self.db_settings)
    try:
        SQLManagerDS.get_instance().get_session()
    except Exception as e:
        raise DenCellORFException('ForceInsertionStrategy.execute(): An error occurred while trying to' +
                                  ' create a session to the database.' +
                                  '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
    # Get the list of entries in the DataSource table and store them in the DataManager
    DataManager.get_instance().store_DS_query_result(Constants.DM_ALL_DATASOURCES,
                                                     'query(DataSource).all()')
    SQLManagerDS.get_instance().close_session()
    # Proceed with the insertion of data
    Logger.get_instance().info('Starting the insertion of data.')
    for data_source in self.datasource:
        try:
            self.insert_datasource(data_source)
        except Exception as e:
            raise DenCellORFException('An error occurred trying to insert the data related to ' +
                                      data_source + '.', e)
    # Log the end of the insertion
    Logger.get_instance().info('The insertion of data has finished.')
def pandas_df_to_csv( output_folder, filename, df, file_desc='', sep=',', ext='.csv', \
                      hdr=True, idx=False, mode='w', encoding='utf-8' ):
    # Create the output folder if it does not yet exist
    # (and its parent folders if necessary)
    if not os.path.isdir( output_folder ):
        os.makedirs( output_folder )
    file_path = os.path.join( output_folder, filename ) + ext
    Logger.get_instance().debug( 'FileHandlerUtil.pandas_df_to_csv():' +
                                 ' The data frame (' + file_desc + ') will be saved in ' +
                                 file_path + '.' )
    # Save the data frame in the file
    try:
        df.to_csv( file_path,
                   sep = sep,
                   header = hdr,
                   index = idx,
                   mode = mode,
                   encoding = encoding )
    except Exception as e:
        raise DenCellORFException( 'FileHandlerUtil.pandas_df_to_csv(): An error occurred trying to' +
                                   ' save the pandas data frame in ' + file_path + '.' +
                                   ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
    else:
        Logger.get_instance().debug( 'FileHandlerUtil.pandas_df_to_csv():' +
                                     ' The data frame (' + file_desc + ') has been successfully' +
                                     ' saved in ' + file_path + '.' )
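# --------------------------------------------------------------------------
# Usage sketch (hypothetical): dumping a small pandas data frame as a
# tab-separated file through pandas_df_to_csv(). The column names and the
# output folder are made up for illustration.
#
#   import pandas as pd
#   df = pd.DataFrame( { 'orf_id': [ 1, 2 ],
#                        'chromosome': [ 'chr1', 'chr2' ] } )
#   FileHandlerUtil.pandas_df_to_csv( output_folder = '/tmp/dencellorf_demo',
#                                     filename = 'orf_table',
#                                     df = df,
#                                     file_desc = 'demo ORF table',
#                                     sep = '\t',
#                                     ext = '.tsv' )
# --------------------------------------------------------------------------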
def import_dsorftranscriptasso_to_merge(self):
    Logger.get_instance().debug('ResumeMergeStrategy.import_dsorftranscriptasso_to_merge():' +
                                ' Getting the dictionary that associates to each existing' +
                                ' ( ORF ID, Transcript ID ) couple the list of DSORFTranscriptAsso' +
                                ' (DS) IDs and converting the lists of IDs into lists of objects.')
    # Get the dictionary that associates to each unique ( ORF ID (PRO),
    # Transcript ID (PRO) ) couple that exists the list of the IDs of
    # all the DSORFTranscriptAsso (DS) that are related to it
    all_existing_orf_tr_asso_ids = FileHandlerUtil.get_obj_from_file(
                                        input_folder=Constants.MERGED_DATA_FOLDER,
                                        filename=Constants.ALL_EXISTING_ORF_TR_ASSO_IDS_FILENAME)
    # Convert the dictionary in order to replace the lists of IDs
    # by lists of corresponding DSORFTranscriptAsso objects
    all_dsota = SQLManagerDS.get_instance().get_session().query(DSORFTranscriptAsso).all()
    SQLManagerDS.get_instance().close_session()
    all_dsota_dict = {}
    for dsota in all_dsota:
        all_dsota_dict[dsota.id] = dsota
    existing_orf_tr_asso_all = {}
    for ((orf_id, tr_id), dsota_ids_list) in all_existing_orf_tr_asso_ids.items():
        existing_orf_tr_asso_all[(orf_id, tr_id)] = map(lambda x: all_dsota_dict.get(x),
                                                        dsota_ids_list)
    # If the merging of DSORFTranscriptAsso entries has already
    # been started once and failed, then remove from the list
    # all the (ORF ID, Transcript ID) couples already processed
    if (self.resume_at_stage == ResumeMergeStrategy.RESUME_DURING_OTA):
        all_processed_ota = SQLManagerPRO.get_instance().get_session().query(ORFTranscriptAsso).all()
        SQLManagerPRO.get_instance().close_session()
        all_processed_ids = [(ota.orf_id, ota.transcript_id) for ota in all_processed_ota]
        existing_orf_tr_asso_all_to_process = { key : val \
                                                for ( key, val ) in existing_orf_tr_asso_all.items() \
                                                if ( ( int( key[0] ), int( key[1] ) ) not in all_processed_ids ) }
        existing_orf_tr_asso_all = existing_orf_tr_asso_all_to_process
        Logger.get_instance().debug('ResumeMergeStrategy.import_dsorftranscriptasso_to_merge(): ' +
                                    str(len(all_processed_ota)) + ' couples have already been' +
                                    ' processed and ' + str(len(existing_orf_tr_asso_all.keys())) +
                                    ' remain to be processed.')
    # Store the dictionary in the DataManager main dictionary
    DataManager.get_instance().store_data(Constants.DM_ALL_EXISTING_ORF_TR_ASSO_DICT,
                                          existing_orf_tr_asso_all)
def execute(self):
    # Set the connection to the database
    self.get_sqlmanager_instance().set_db_settings(self.db_settings)
    # Check the integrity of the database
    str_ok = self.get_sqlmanager_instance().check_database_str_integrity()
    if (not str_ok):
        raise DenCellORFException('The schema of the database provided does not follow' +
                                  ' the expected model. Please make sure the provided model (' +
                                  self.db_model + ') and the database (' +
                                  self.db_settings[Constants.DB_SETTINGS_DB_NAME] +
                                  ') provided are the right ones.')
    # Get the declarative base corresponding to the database
    base = self.get_sqlmanager_instance().get_declarative_base()
    # Build a dictionary of the classes defined in the model
    # where the keys are the classes, and the values their names
    dict_model_classes = {}
    for (cl_name, cl_object) in base._decl_class_registry.items():
        if (not str(cl_name) == '_sa_module_registry'):
            dict_model_classes[cl_object] = str(cl_name)
    # For each table, get the list of all entries and save them in a file
    for table in dict_model_classes.keys():
        # Get the name of the table
        table_name = str(dict_model_classes[table])
        Logger.get_instance().debug('Starting to save the entries of the ' + table_name + ' table.')
        # Get all the entries to save
        objects_to_save = self.get_sqlmanager_instance().get_session().query(table).all()
        # Expunge the session to detach the objects in the list from the session
        self.get_sqlmanager_instance().get_session().expunge_all()
        self.get_sqlmanager_instance().close_session()
        if self.file_prefix:
            filename = self.file_prefix + table_name
        else:
            filename = table_name
        try:
            FileHandlerUtil.save_obj_to_file(objects_to_save=objects_to_save,
                                             filename=filename,
                                             output_folder=self.output_folder)
        except Exception as e:
            raise DenCellORFException('BackupStrategy.execute(): An error occurred trying to' +
                                      ' save data in the file.' +
                                      '\n Error code: ' + LogCodes.ERR_FILEHAND + '.', e)
def initialize(self):
    # Get the main keyword that defines the strategy
    self.strategy = sys.argv[1]
    # If the strategy is not known, check if the user asked for help.
    # Otherwise, raise a DenCellORFException.
    if (self.strategy not in OptionConstants.STRATEGIES_LIST):
        # Display help on the console if necessary and exit the program
        if (self.strategy in ['-h', '--help']):
            print('To run a strategy, you need to type the command such as: \n' +
                  'python $PYTHONPATH/fr/tagc/uorf/uorf.py [StrategyKeyword] [Options] \n' +
                  'or DenCellORF [StrategyKeyword] [Options]. \n' +
                  'The following strategies are available: ' +
                  ', '.join(OptionConstants.STRATEGIES_LIST) + '.\n' +
                  ' You may find more information about the options available for each strategy' +
                  ' using the command DenCellORF [StrategyKeyword] -h' +
                  ' or DenCellORF [StrategyKeyword] --help. \n' +
                  ' For extensive information, please read the user manual (PDF file).')
            exit()
        else:
            raise DenCellORFException('The strategy selected (' + self.strategy + ') is not correct.' +
                                      ' It must be one of ' +
                                      ', '.join(OptionConstants.STRATEGIES_LIST) +
                                      '. Please see the documentation for more information.')
    Logger.get_instance().info('---')
    Logger.get_instance().info('Selected strategy: ' + self.strategy)
    # Build an option parser to collect the option values
    self.optionParser = OptionParser()
    for current_prop_list in OptionConstants.OPTION_LIST[self.strategy]:
        self.optionParser.add_option(current_prop_list[0],
                                     current_prop_list[1],
                                     action=current_prop_list[2],
                                     type=current_prop_list[3],
                                     dest=current_prop_list[4],
                                     default=current_prop_list[5],
                                     help=current_prop_list[6])
    # Get the various option values into a dictionary
    (opts, args) = self.optionParser.parse_args()
    self.optionDict = vars(opts)
    self.args = args
    # Log the settings
    Logger.get_instance().info('Settings:')
    for opt in self.optionDict.items():
        Logger.get_instance().info("-" + str(opt[0]) + ": '" + str(opt[1]) + "'")
    Logger.get_instance().info('---')
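# --------------------------------------------------------------------------
# Invocation sketch (hypothetical): what the command line parsed above is
# expected to look like. The strategy keyword and the option flag below are
# examples only; the actual keywords come from
# OptionConstants.STRATEGIES_LIST and OptionConstants.OPTION_LIST.
#
#   DenCellORF Insertion -c /path/to/config.ini
#   # sys.argv[1] -> 'Insertion' (the strategy keyword)
#   # remaining arguments -> collected by the OptionParser built above
# --------------------------------------------------------------------------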
def execute( self ):
    # Set the connection to the database
    self.get_sqlmanager_instance().set_db_settings( self.db_settings )
    try:
        self.get_sqlmanager_instance().get_session()
    except Exception as e:
        raise DenCellORFException( 'AddReleaseVersionStrategy.execute(): An error occurred trying to' +
                                   ' create a session to the database.' +
                                   '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e )
    self.get_sqlmanager_instance().close_session()
    # If there is already an annotation version information registered
    # in the metadata table, get it
    release_metadata_query = self.get_sqlmanager_instance().get_session().query(
                                    self.get_metadata_class() ).filter(
                                        self.get_metadata_class().parameter == Constants.METATABLE_DATABASE_VERSION_NUMBER )
    # If a version has already been registered, update it if necessary
    # (i.e. if the forceOverwrite option has been selected) or log a
    # critical message if it differs from the provided one
    if ( release_metadata_query.count() != 0 ):
        # Get the entry
        metadata_release_entry = release_metadata_query.one()
        if ( self.force_overwrite ):
            metadata_release_entry.value = self.db_release
            metadata_release_entry.description = self.db_desc
            # Commit the updates
            self.get_sqlmanager_instance().commit()
        else:
            if ( ( metadata_release_entry.value != self.db_release )
                 or ( metadata_release_entry.description != self.db_desc ) ):
                Logger.get_instance().critical( 'A different version has already been registered in the metadata' +
                                                ' table for this database (' + metadata_release_entry.value +
                                                ', ' + metadata_release_entry.description +
                                                ') and the forceOverwrite option has not been selected.' +
                                                ' Hence, the version number and/or description will not' +
                                                ' be updated.' )
    # Otherwise, create a new entry in the metadata table
    else:
        metadata_release = self.get_metadata_class()( parameter = Constants.METATABLE_DATABASE_VERSION_NUMBER,
                                                      value = self.db_release,
                                                      description = self.db_desc )
        self.get_sqlmanager_instance().get_session().add( metadata_release )
        self.get_sqlmanager_instance().commit()
    self.get_sqlmanager_instance().close_session()
def dict_to_csv( output_folder, filename, dict, file_desc='', sort=False, sep=',', ext='.csv', \
                 hdr=None, key_func=lambda k: k, val_func=lambda v: v, unlist_key=False, unlist_val=False ):
    # Create the output folder if it does not yet exist
    # (and its parent folders if necessary)
    if not os.path.isdir( output_folder ):
        os.makedirs( output_folder )
    file_path = os.path.join( output_folder, filename ) + ext
    Logger.get_instance().debug( 'FileHandlerUtil.dict_to_csv(): The content of the dictionary (' +
                                 file_desc + ') will be saved in ' + file_path + '.' )
    # Save the content of the dictionary in the file
    try:
        with open( file_path, 'wb' ) as csv_file:
            writer = csv.writer( csv_file, delimiter = sep )
            # Write the header if necessary
            if hdr:
                writer.writerow( hdr )
            # Write the dictionary as key, value
            if sort:
                key_list = sorted( dict.keys() )
            else:
                key_list = dict.keys()
            for k in key_list:
                # Get the value and apply the functions to transform
                # the key and the value if necessary
                key = key_func( k )
                val = val_func( dict.get( k ) )
                # Write the new row in the file
                if ( ( not unlist_key ) or ( not isinstance( key, list ) ) ):
                    key = [ key ]
                if ( ( not unlist_val ) or ( not isinstance( val, list ) ) ):
                    val = [ val ]
                writer.writerow( key + val )
    except Exception as e:
        raise DenCellORFException( 'FileHandlerUtil.dict_to_csv(): An error occurred trying to save' +
                                   ' the content of the dictionary in ' + file_path + '.' +
                                   '\n Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
    else:
        Logger.get_instance().debug( 'FileHandlerUtil.dict_to_csv():' +
                                     ' The content of the dictionary (' + file_desc +
                                     ') has been successfully saved in ' + file_path +
                                     '. Please see the documentation for more information.' )
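# --------------------------------------------------------------------------
# Usage sketch (hypothetical): writing a dictionary whose values are lists.
# With unlist_val=True, each element of the value list gets its own column
# instead of the whole list being written as a single cell. All names and
# values below are made up for illustration.
#
#   counts = { 'chr1': [ 10, 4 ], 'chr2': [ 7, 1 ] }
#   FileHandlerUtil.dict_to_csv( output_folder = '/tmp/dencellorf_demo',
#                                filename = 'orf_counts',
#                                dict = counts,
#                                file_desc = 'ORF counts per chromosome',
#                                sort = True,
#                                hdr = [ 'chromosome', 'orf_count', 'transcript_count' ],
#                                unlist_val = True )
#   # Produces rows such as: chr1,10,4
# --------------------------------------------------------------------------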
def __init__( self ):
    configfile = OptionManager.get_instance().get_option( OptionConstants.OPTION_CONFIG_FILE_PATH,
                                                          not_none = True )
    if configfile:
        self.configfile = configfile
        if ( not os.path.exists( configfile ) ):
            raise DenCellORFException( 'No config file may be found at the path provided (' +
                                       self.configfile + ').' )
    else:
        raise DenCellORFException( 'A config file has to be provided.' +
                                   ' See the documentation for more information.' )
    # Check if the forceOverwrite option has been selected
    if OptionManager.get_instance().get_option( OptionConstants.OPTION_FORCE_OVERWRITE,
                                                not_none = False ):
        self.force_overwrite = True
    else:
        self.force_overwrite = False
    self.species = None
    self.ensembl_release_version = None
    # Get the number of threads available
    self.thread_nb = OptionManager.get_instance().get_option( OptionConstants.OPTION_THREAD_NB,
                                                              not_none = False )
    available_thread_nb = cpu_count()
    if self.thread_nb:
        try:
            self.thread_nb = int( self.thread_nb )
        except:
            raise DenCellORFException( 'ComputeRelCoordStrategy: The value provided for the number' +
                                       ' of threads needs to be an integer (provided value: ' +
                                       str( self.thread_nb ) + ').' )
        else:
            if ( self.thread_nb < 1 ):
                raise DenCellORFException( 'ComputeRelCoordStrategy: The value provided for the number' +
                                           ' of threads needs to be an integer equal to or greater' +
                                           ' than 1 (provided value: ' + str( self.thread_nb ) + ').' )
            if ( self.thread_nb > available_thread_nb ):
                Logger.get_instance().info( 'The number of threads provided (' + str( self.thread_nb ) +
                                            ') is greater than the number of threads actually' +
                                            ' available (' + str( available_thread_nb ) +
                                            '). Hence, ' + str( available_thread_nb ) +
                                            ' threads will be used for the computation.' )
                self.thread_nb = available_thread_nb
    else:
        self.thread_nb = available_thread_nb
    Logger.get_instance().debug( 'ComputeRelCoordStrategy: ' + str( self.thread_nb ) + ' threads' +
                                 ' will be used for the computation of relative coordinates.' )
def __init__(self, log_path=Constants.PATH_GENEREF_LOG,
             writing_mode=Constants.GENEREF_LOG_DEFAULT):
    self.logg = GeneRefLogger.set_logger(log_path, writing_mode)
    # Log the instantiation of this logger in the main logger
    Logger.get_instance().warning('A warning related to gene references has been raised' +
                                  ' during the execution of the program. All warnings related' +
                                  ' to the gene references will be logged in the file "' +
                                  str(log_path) + '". Please see the documentation for' +
                                  ' more information.')
def init_log_file(self, ext='.tsv'):
    # Create the output folder if it does not yet exist
    # (and its parent folders if necessary)
    if (not os.path.isdir(self.output_folder)):
        os.makedirs(self.output_folder)
    file_path = os.path.join(self.output_folder, self.file_name + ext)
    Logger.get_instance().info('The logs will be saved in ' + file_path + '.')
    # Create (or truncate) the log file
    log_file = open(file_path, mode='w')
    log_file.close()
def build_sqlite_database(self, force_overwrite):
    # If there is already a database at the path
    if os.path.exists(self.db_path):
        # And the forceOverwrite option has been selected,
        # then remove the database file
        if force_overwrite:
            self.remove_sqlite_db()
            return True
        else:
            # Check if the existing database follows the appropriate model
            if not self.check_database_str_integrity():
                # Ask the user to confirm the deletion of the database
                confirm_deletion = None
                Logger.get_instance().info('The database provided does not use the appropriate' +
                                           ' model. Hence, the database will be removed and built' +
                                           ' again using the right model.')
                while (confirm_deletion not in ['Y', 'N']):
                    print('Do you want to confirm the deletion of the database? (Y/N)')
                    confirm_deletion = raw_input().upper()
                # If the user refuses the deletion of the database, log a critical error
                if (confirm_deletion == 'N'):
                    Logger.get_instance().critical('As the database does not use the appropriate' +
                                                   ' model and the deletion has been canceled by the' +
                                                   ' user, the program will be stopped. Please see' +
                                                   ' the documentation for more information.')
                # Otherwise delete the database and create a new one
                self.remove_sqlite_db()
                return True
            else:
                return False
    else:
        return True
def add_and_commit(self, objects_to_add, process='Undefined process'):
    # Get the number of objects that are expected to be inserted in the database
    total_count = len(objects_to_add)
    # Add the objects to the session
    try:
        self.get_session().add_all(objects_to_add)
    except Exception as e:
        # Get the number of objects of each type in the list
        types_dict = GeneralUtil.get_type_counts_in_list(objects_to_add)
        types_dict_str = ', '.join([str(tp) + ': ' + str(val)
                                    for (tp, val) in types_dict.items()])
        raise DenCellORFException(self.classname + '.add_and_commit():' +
                                  ' An error occurred trying to add ' + str(total_count) +
                                  ' objects (from ' + process + ') to the session.' +
                                  ' The list contained the following objects: ' +
                                  types_dict_str + '.', e)
    # Commit the changes
    try:
        self.commit()
    except Exception as e:
        # Get the number of objects of each type in the list
        types_dict = GeneralUtil.get_type_counts_in_list(objects_to_add)
        types_dict_str = ', '.join([str(tp) + ': ' + str(val)
                                    for (tp, val) in types_dict.items()])
        raise DenCellORFException(self.classname + '.add_and_commit():' +
                                  ' An error occurred trying to commit changes after the addition of ' +
                                  str(total_count) + ' objects (from ' + process + ') to the session.' +
                                  ' The list contained the following objects: ' +
                                  types_dict_str + '.', e)
    # Log in debug mode the number of objects successfully inserted
    Logger.get_instance().debug(self.classname + '.add_and_commit(): ' + str(total_count) +
                                ' objects (from ' + process + ') have been successfully' +
                                ' added to the database.')
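# --------------------------------------------------------------------------
# Usage sketch (hypothetical): inserting a batch of model objects through
# add_and_commit(). DataSource is a model class used elsewhere in this
# codebase; the source names are made up for illustration.
#
#   new_sources = [ DataSource( name = 'SourceA' ),
#                   DataSource( name = 'SourceB' ) ]
#   SQLManagerDS.get_instance().add_and_commit( objects_to_add = new_sources,
#                                               process = 'demo insertion' )
# --------------------------------------------------------------------------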
def store_DS_query_result(self, keyword, query_string):
    # Perform the query
    Logger.get_instance().debug('DataManager.store_DS_query_result():' +
                                ' Querying the DS database: "' + query_string + '".')
    try:
        query_result = eval('SQLManagerDS.get_instance().get_session().' + query_string)
    except Exception as e:
        raise DenCellORFException('DataManager.store_DS_query_result(): The query "' +
                                  query_string + '" failed.', e)
    # Convert the result of the query into a dictionary where each element
    # of the result list is used both as key and as value
    query_result = GeneralUtil.list_to_dict(query_result)
    # Store the dictionary in the data dictionary
    self.data[keyword] = query_result
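# --------------------------------------------------------------------------
# Usage sketch (hypothetical): the query_string argument is evaluated
# against the DS session, so it must be a valid SQLAlchemy query expressed
# as a string. This mirrors the call made in ForceInsertionStrategy above.
#
#   DataManager.get_instance().store_DS_query_result( Constants.DM_ALL_DATASOURCES,
#                                                     'query(DataSource).all()' )
#   all_datasources = DataManager.get_instance().get_data( Constants.DM_ALL_DATASOURCES )
# --------------------------------------------------------------------------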
def remove_sqlite_db(self):
    if os.path.exists(self.db_path):
        try:
            remove(self.db_path)
        except Exception as e:
            raise DenCellORFException('The database located at ' + str(self.db_path) +
                                      ' cannot be deleted.', e)
        else:
            Logger.get_instance().info('The database file located at ' + str(self.db_path) +
                                       ' has been deleted.')
    else:
        Logger.get_instance().error(self.classname + '.remove_sqlite_db(): There is no file' +
                                    ' located at ' + str(self.db_path) + '.' +
                                    ' Error code: ' + LogCodes.ERR_SQL_FILE + '.',
                                    ex=False)
def build_database(self, db_settings, species, sp_mandatory=True, force_overwrite=False):
    # Store the settings necessary to establish the connection
    self.set_db_settings(db_settings)
    # Check that a species is provided
    if ((sp_mandatory) and ((species == None) or (len(species) == 0))):
        raise DenCellORFException(self.classname + '.build_database(): A species needs' +
                                  ' to be provided!')
    # Get the engine to the dedicated database
    self.create_engine()
    # Check and / or remove the existing database if necessary
    if (self.db_type == SQLConstants.DB_TYPE_SQLITE):
        reset_model = self.build_sqlite_database(force_overwrite)
    # Check and / or remove the existing database if necessary
    # and / or create the database on the server if necessary
    elif (self.db_type == SQLConstants.DB_TYPE_MYSQL):
        reset_model = self.build_mysql_database(force_overwrite)
    # Open a session
    self.create_session()
    # If the model does not yet exist, create all the required tables
    if reset_model:
        self.BASE.metadata.create_all(self.engine)
        Logger.get_instance().info('The database ' + self.db_path + ' has been created.')
    else:
        Logger.get_instance().info('The database ' + self.db_path + ' will be used.')
    self.session.close()
def get_ensembl_db( sp, annotation_version ):
    Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading and indexing the Ensembl' +
                                 ' database release ' + str( annotation_version ) +
                                 ' for ' + sp + '.' )
    ensembl_db = EnsemblRelease( release = annotation_version,
                                 species = sp )
    # Download and index the database if not yet in the temporary folder
    Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading the Ensembl' +
                                 ' database release ' + str( annotation_version ) +
                                 ' for ' + sp + '.' )
    try:
        ensembl_db.download()
    except Exception as e:
        raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                   ' download the Ensembl database using pyensembl.', e )
    Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Indexing the Ensembl' +
                                 ' database release ' + str( annotation_version ) +
                                 ' for ' + sp + '.' )
    try:
        ensembl_db.index()
    except Exception as e:
        raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                   ' index the Ensembl database using pyensembl.', e )
    return ensembl_db
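# --------------------------------------------------------------------------
# Usage sketch (hypothetical): fetching an indexed pyensembl database and
# querying it. The species string, release number and transcript ID below
# are examples only; pyensembl accepts species names such as 'homo_sapiens'.
#
#   ensembl_db = EnsemblUtil.get_ensembl_db( sp = 'homo_sapiens',
#                                            annotation_version = 90 )
#   transcript = ensembl_db.transcript_by_id( 'ENST00000335137' )
# --------------------------------------------------------------------------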
def batch_insert_to_db(self, objects_to_insert, source):
    Logger.get_instance().debug('Starting the insertion of data from ' + source + '.')
    # Save into a temporary file the data that should be inserted.
    # This allows recovering the data later if an exception is raised
    # during the insertion, thus saving the parsing time.
    try:
        FileHandlerUtil.save_obj_to_file(objects_to_save=objects_to_insert,
                                         filename='objects_from_' + source,
                                         output_folder=Constants.PARSED_DATA_FOLDER)
    except Exception as e:
        Logger.get_instance().error('InsertionStrategy.batch_insert_to_db():' +
                                    ' An error occurred trying to save data from ' + source +
                                    ': \n' + str(e) +
                                    ' Error code: ' + LogCodes.ERR_FILEHAND + '.',
                                    ex=False)
    # Insert the objects into the database
    SQLManagerDS.get_instance().batch_insert_to_db(objects_to_insert=objects_to_insert,
                                                   process=source)
    Logger.get_instance().debug('The insertion of data from ' + source + ' has finished.')
def prepare_r_annotation_package( species_short_name, species_full_name, species_common_name, \
                                  ensembl_release_version ):
    # Define the temporary folder where to install the packages
    annot_package_dir = os.path.join( DefaultTemporaryFolder.TEMPORARY_FOLDER,
                                      'R_ensembl_annot_packages',
                                      species_common_name + str( ensembl_release_version ) )
    if ( not os.path.exists( annot_package_dir ) ):
        os.makedirs( annot_package_dir )
    # Run the R script (as a Python subprocess)
    # to prepare the packages
    r_args = [ '--ensemblRelease=' + str( ensembl_release_version ),
               '--speciesFullName=' + species_full_name,
               '--speciesShortName=' + species_short_name,
               '--speciesCommonName=' + species_common_name,
               '--annotPackageDir=' + annot_package_dir ]
    r_command = [ 'Rscript', ComputeRelCoordStrategy.R_SCRIPT_BUILD_ANNOT_PACKAGE_PATH ] + r_args
    Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package(): The R script' +
                                 ' will be run with the following arguments ' + ' '.join( r_args ) )
    r_process = subprocess.Popen( r_command,
                                  stdout = subprocess.PIPE,
                                  stderr = subprocess.PIPE )
    ( stdout, stderr ) = r_process.communicate()
    if ( stdout != '' ):
        Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package():' +
                                     ' The R script returned the following standard output: \n' +
                                     stdout )
    if ( stderr != '' ):
        Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package():' +
                                     ' The R script returned the following error output: \n' +
                                     stderr )
def execute(self):
    strategy_command = OptionManager.get_instance().get_strategy()
    if (strategy_command != None):
        try:
            strategy = eval(strategy_command + 'Strategy()')
        except Exception as e:
            raise DenCellORFException('DenCellORF.execute(): An error occurred during the' +
                                      ' instantiation of the strategy: ' + str(e))
    else:
        Logger.get_instance().critical('DenCellORF.execute(): A strategy must be provided!' +
                                       ' The following strategies are available: ' +
                                       ', '.join(OptionConstants.STRATEGIES_LIST) +
                                       '. See the documentation for more information.')
    try:
        strategy.execute()
    except Exception as e:
        raise DenCellORFException('DenCellORF.execute(): An error occurred during the execution' +
                                  ' of the program.', e)
def insert_datasource(data_source):
    # For an easier manipulation of the data stored in the DataManager,
    # assign a new variable to access the list of data sources
    all_datasources = DataManager.get_instance().get_data(Constants.DM_ALL_DATASOURCES)
    # Make sure the source is not already in the database
    ds = DataSource(name=data_source)
    # If the source is in the database, do not proceed with the insertion
    if ds in all_datasources:
        Logger.get_instance().info('The source "' + data_source + '" has been found in the database.' +
                                   ' Hence, the data from this source will not be inserted again.' +
                                   ' If for some reason you need to perform the insertion again,' +
                                   ' please first use the Deletion strategy.' +
                                   ' Please see the documentation for more information.')
    # Otherwise proceed with the insertion of data
    else:
        Logger.get_instance().debug('Starting the insertion of data from ' + data_source + '.')
        # Get the list of objects to insert from the file
        try:
            objects_to_insert = FileHandlerUtil.get_obj_from_file(input_folder=Constants.PARSED_DATA_FOLDER,
                                                                  filename='objects_from_' + data_source)
        except Exception as e:
            raise DenCellORFException('An error occurred trying to import the data for the source ' +
                                      data_source + ' from its file.' +
                                      '\n Error code: ' + LogCodes.ERR_FILEHAND + '.', e)
        else:
            try:
                SQLManagerDS.get_instance().batch_insert_to_db(objects_to_insert=objects_to_insert,
                                                               process=data_source)
            except DenCellORFException as e:
                raise DenCellORFException('An error occurred trying to insert the data from ' +
                                          data_source + '.' +
                                          '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
            Logger.get_instance().info('The insertion of data from ' + data_source + ' has finished.')
def execute( self ):
    # Create a session to the "PRO-like" database
    SQLManagerPRO.get_instance().set_db_settings( self.db_settings )
    try:
        SQLManagerPRO.get_instance().get_session()
    except Exception as e:
        raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred trying to' +
                                   ' create a session to the database.' +
                                   '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e )
    SQLManagerPRO.get_instance().close_session()
    Logger.get_instance().info( 'Starting to build the track file.' )
    # Track header and settings
    # -------------------------
    # Define track labels
    sp = SQLManagerPRO.get_instance().get_session().query( PROSpeciesCatalog.name ).one()[ 0 ]
    current_annotation = SQLManagerPRO.get_instance().get_session().query(
                                PROMetadata.value ).filter(
                                    PROMetadata.parameter == Constants.METATABLE_CURRENT_ANNOTATION ).one()[ 0 ]
    current_ucsc_annot = Constants.CORRESPONDING_UCSC_FROM_NCBI[ current_annotation ]
    SQLManagerPRO.get_instance().close_session()
    track_track = '{project_name}_{species}_{annotation}'.format( project_name = Constants.PROJECT_NAME,
                                                                  species = sp,
                                                                  annotation = current_ucsc_annot )
    track_track = 'track ' + track_track
    track_shortLabel = '{project_name}_{species}_{annotation}'.format( project_name = Constants.PROJECT_NAME,
                                                                       species = sp,
                                                                       annotation = current_ucsc_annot )
    track_shortLabel = 'shortLabel ' + track_shortLabel
    track_longLabel = '{project_name} {species} track hub ({annotation}), See {db_url} for more information regarding this track'.format(
                            project_name = Constants.PROJECT_NAME,
                            species = sp,
                            annotation = current_ucsc_annot,
                            db_url = self.WEBSITE_URL )
    track_longLabel = 'longLabel ' + track_longLabel
    # Path to the HTML descriptive file
    track_html = 'html ' + self.TRACK_HTML_DESC_PATH
    # Path to the BigBed file
    track_bigDataUrl = ( 'bigDataUrl ' + GenerateTrackDbFileStrategy.BIGBED_FILENAME +
                         GenerateBEDFileStrategy.BIGBED_FILE_EXTENSION )
    track_type = 'type bigBed 12 +'
    # Track visualization
    track_default_vis = ( 'visibility full\n' +
                          'thickDrawItem on\n' +
                          'itemRgb on\n' +
                          'maxItems 100000\n' +
                          'exonArrows on\n' +
                          'exonNumbers on' )
    # Track filters
    # -------------
    # Transcript IDs
    all_transcript_ids = SQLManagerPRO.get_instance().get_session().query(
                                Transcript.transcript_id.distinct() ).all()
    all_transcript_ids = sorted( GeneralUtil.query_result_to_list( all_transcript_ids ) )
    SQLManagerPRO.get_instance().close_session()
    transcripts_filter_values = ',\\\n'.join( all_transcript_ids )
    track_filter_transcripts = ( 'filterType.transcripts multipleListOr\n' +
                                 'filterText.transcripts *\n' +
                                 'filterLabel.transcripts Transcript IDs\n' +
                                 'filterValues.transcripts ' + transcripts_filter_values )
    # RNA biotypes
    all_rna_biotypes = SQLManagerPRO.get_instance().get_session().query(
                                Transcript.rna_biotype.distinct() ).filter(
                                    Transcript.rna_biotype != None ).all()
    all_rna_biotypes = sorted( GeneralUtil.query_result_to_list( all_rna_biotypes ) )
    SQLManagerPRO.get_instance().close_session()
    rnabiotypes_filter_values = ',\\\n'.join( all_rna_biotypes )
    track_filter_rnabiotypes = ( 'filterType.rna_biotypes multipleListOr\n' +
                                 'filterText.rna_biotypes *\n' +
                                 'filterLabel.rna_biotypes RNA biotypes\n' +
                                 'filterValues.rna_biotypes ' + rnabiotypes_filter_values )
    # Cell contexts
    all_cell_contexts = SQLManagerPRO.get_instance().get_session().query(
                                CellContextCatalog.context ).all()
    all_cell_contexts = sorted( GeneralUtil.query_result_to_list( all_cell_contexts ) )
    SQLManagerPRO.get_instance().close_session()
    celltypes_filter_values = ',\\\n'.join( all_cell_contexts )
    track_filter_celltypes = ( 'filterType.cell_types multipleListOr\n' +
                               'filterText.cell_types *\n' +
                               'filterLabel.cell_types Cell types (cell lines, tissues...)\n' +
                               'filterValues.cell_types ' + celltypes_filter_values )
    # ORF annotations
    all_orfannotations = SQLManagerPRO.get_instance().get_session().query(
                                ORFAnnotationCatalog.annotation ).all()
    all_orfannotations = sorted( GeneralUtil.query_result_to_list( all_orfannotations ) )
    SQLManagerPRO.get_instance().close_session()
    orfannotations_filter_values = ',\\\n'.join( all_orfannotations )
    track_filter_orfannotations = ( 'filterType.orf_annotations multipleListOr\n' +
                                    'filterText.orf_annotations *\n' +
                                    'filterLabel.orf_annotations ORF Annotations\n' +
                                    'filterValues.orf_annotations ' + orfannotations_filter_values )
    # Kozak contexts
    all_kozak_ctxt_comp = SQLManagerPRO.get_instance().get_session().query(
                                ORFTranscriptAsso.kozak_context_comp.distinct() ).filter(
                                    ORFTranscriptAsso.kozak_context_comp != None ).all()
    all_kozak_ctxt_comp = sorted( GeneralUtil.query_result_to_list( all_kozak_ctxt_comp ) )
    SQLManagerPRO.get_instance().close_session()
    kozakcontexts_filter_values = ',\\\n'.join( all_kozak_ctxt_comp )
    track_filter_kozakcontexts = ( 'filterType.kozak_contexts multipleListOr\n' +
                                   'filterText.kozak_contexts *\n' +
                                   'filterLabel.kozak_contexts Computed Kozak context\n' +
                                   'filterValues.kozak_contexts ' + kozakcontexts_filter_values )
    # URLs
    # ----
    url_name = 'name="' + self.WEBSITE_URL + '/ORF/' + self.WEBSITE_URL_SPECIES[ sp ] + '/$$' + '"'
    url_transcript = ( 'transcripts="' + self.WEBSITE_URL + '/transcript/' +
                       self.WEBSITE_URL_SPECIES[ sp ] + '/$$' + '"' )
    track_urls = 'urls ' + '\\\n'.join( [ url_name, url_transcript ] )
    # Additional lines
    # ----------------
    track_labelFields = ( 'labelFields name, transcripts, rna_biotypes, cell_types,' +
                          ' orf_annotations, kozak_contexts' )
    # Write the trackDb file
    # ----------------------
    track_content = [ track_track,
                      track_shortLabel,
                      track_longLabel,
                      track_html,
                      track_bigDataUrl,
                      track_type,
                      track_default_vis,
                      track_filter_transcripts,
                      track_filter_rnabiotypes,
                      track_filter_celltypes,
                      track_filter_orfannotations,
                      track_filter_kozakcontexts,
                      track_labelFields,
                      track_urls ]
    track_content = '\n'.join( track_content ) + '\n'
    # Create the output folder if necessary
    if ( not os.path.isdir( self.output_folder ) ):
        os.makedirs( self.output_folder )
    track_db_file_path = os.path.join( self.output_folder,
                                       self.filename + self.TRACK_DB_FILE_EXTENSION )
    with open( track_db_file_path, 'w' ) as track_db_file:
        track_db_file.write( track_content )
    Logger.get_instance().info( 'The trackDb file has been successfully created and saved at ' +
                                track_db_file_path + '.' )
    # Create the BigBed file if necessary
    # -----------------------------------
    if self.generate_bigbed:
        # Overwrite / define some options necessary
        # to run the GenerateBEDFile strategy
        OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_CONVERT_TO_BIGBED,
                                                 option_value = True )
        # The output folder is the same as the one of the trackDb file
        OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_OUTPUT_FOLDER,
                                                 option_value = self.output_folder )
        # Define the BigBed filename
        OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_BED_FILENAME,
                                                 option_value = GenerateTrackDbFileStrategy.BIGBED_FILENAME )
        # Set the BigBed format to 12 + 5
        OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_BED_EXTENDED,
                                                 option_value = True )
        try:
            generatebedfilestrategy = GenerateBEDFileStrategy()
        except Exception as e:
            raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred' +
                                       ' trying to instantiate a GenerateBEDFileStrategy.', e )
        try:
            generatebedfilestrategy.execute()
        except Exception as e:
            raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred' +
                                       ' during the execution of the GenerateBEDFile strategy.', e )
def compute_tr_cds_relative_coordinates( self ):
    Logger.get_instance().info( 'Starting the computation of relative CDS transcript start and stop' +
                                ' coordinates (registered in the Transcript table).' )
    # Get all the transcripts for which there are CDS
    # start and stop positions provided
    # NB: The query is performed using a raw SQL statement for better efficiency
    transcript_info_sql_statement = 'SELECT Transcript.id, Transcript.transcript_id AS tr_id, \
                                            Transcript.gene_id, PROGene.chromosome, \
                                            Transcript.cds_start_pos AS start_pos, \
                                            Transcript.cds_stop_pos AS end_pos \
                                     FROM Transcript \
                                     INNER JOIN PROGene ON PROGene.gene_id = Transcript.gene_id \
                                     WHERE ( Transcript.cds_start_pos IS NOT NULL ) \
                                     AND ( Transcript.cds_stop_pos IS NOT NULL )'
    if ( not self.force_overwrite ):
        transcript_info_sql_statement += ' AND ( ( Transcript.rel_cds_start_pos IS NULL ) \
                                           OR ( Transcript.rel_cds_stop_pos IS NULL ) )'
    transcript_info_df = pd.read_sql( transcript_info_sql_statement,
                                      SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()
    Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_tr_cds_relative_coordinates(): ' +
                                 str( transcript_info_df.shape[ 0 ] ) + ' Transcript entries are' +
                                 ' expected to be processed.' )
    # As the conversion of coordinates in R may be highly time-consuming,
    # split the data frame into small data frames and multi-process the
    # computation.
    # Split the data frame into smaller data frames that can be processed
    # independently from each other
    subset_data_frames = [ transcript_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                           for min_bound in xrange( 0,
                                                    transcript_info_df.shape[ 0 ],
                                                    Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
    # For each of the subset data frames, process it with R in order
    # to build a dataset containing the start and stop relative
    # coordinates.
    # Instantiate the list of tuple-embedded arguments necessary to
    # compute the relative coordinates
    args_to_run_r = []
    filename_prefix = self.TRANSCRIPT_CSV_FILE_PREFIX
    filename_suffix = 0
    for df in subset_data_frames:
        args_to_run_r.append( ( df,
                                self.species,
                                self.ensembl_release_version,
                                filename_prefix,
                                filename_suffix ) )
        filename_suffix += 1
    # Instantiate the pool of processes
    p = Pool( self.thread_nb )
    messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
    p.close()
    # Wait for all the processes to be completed
    p.join()
    # Log the messages generated by the processes
    for messages in messages_to_log:
        ( debug_messages_to_log, stdout, stderr ) = messages
        for message in debug_messages_to_log:
            Logger.get_instance().debug( message )
        if ( stdout != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following standard output: \n' +
                                         stdout )
        # NB: As the R function is susceptible to write non-error-related
        #     messages in stderr, these messages are also logged at the
        #     debug level
        if ( stderr != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following error output: \n' +
                                         stderr )
    # Sequentially open the CSV files to get the relative positions.
    # Instantiate a dictionary that associates to each Transcript ID
    # the relative start and stop positions of the CDS
    rel_positions_dict = {}
    for file_nb in range( filename_suffix ):
        df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                        filename_prefix + str( file_nb ) + '.csv' ),
                          sep = ',',
                          encoding = 'utf-8' )
        for ( index, row ) in df.iterrows():
            rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
    # Add the relative start and stop positions for all the Transcript entries
    all_transcripts = SQLManagerPRO.get_instance().get_session().query( Transcript ).filter(
                            Transcript.id.in_( rel_positions_dict.keys() ) ).all()
    for transcript in all_transcripts:
        # Get the start and stop positions
        positions = rel_positions_dict.get( transcript.id )
        rel_cds_start_pos = positions[ 0 ]
        rel_cds_stop_pos = positions[ 1 ]
        if not pd.isna( rel_cds_start_pos ):
            transcript.rel_cds_start_pos = int( rel_cds_start_pos )
        if not pd.isna( rel_cds_stop_pos ):
            transcript.rel_cds_stop_pos = int( rel_cds_stop_pos )
    # Commit the updates and close the session
    SQLManagerPRO.get_instance().commit()
    SQLManagerPRO.get_instance().close_session()
    # Delete the pool instance
    p.clear()
def compute_ota_relative_coordinates( self ):
    Logger.get_instance().info( 'Starting the computation of relative ORF start and stop coordinates' +
                                ' (registered in the ORFTranscriptAsso table).' )
    # Get the information related to the ORF
    # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
    # - Its unique ID in the database
    # - The ID of its ORF-related entry, as well as the chromosome,
    #   start and stop positions of the ORF
    # NB: The query is performed using a raw SQL statement for better efficiency
    orf_info_sql_statement = 'SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.orf_id, \
                                     ORF.chromosome, ORF.start_pos, ORF.stop_pos AS end_pos \
                              FROM ORF \
                              INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.orf_id = ORF.id'
    if ( not self.force_overwrite ):
        orf_info_sql_statement += ' WHERE ( ORFTranscriptAsso.rel_start_pos IS NULL ) \
                                   OR ( ORFTranscriptAsso.rel_stop_pos IS NULL )'
    orf_info_df = pd.read_sql( orf_info_sql_statement,
                               SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()
    # Get the information related to the transcript
    # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
    # - Its unique ID in the database
    # - The ID of its Transcript-related entry
    # NB: All "UNKNOWN_TRANSCRIPT" entries are excluded as an official ID is needed to perform
    #     the conversion.
    # NB: The query is performed using a raw SQL statement for better efficiency
    transcript_info_sql_statement = "SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.transcript_id, \
                                            Transcript.transcript_id AS tr_id \
                                     FROM Transcript \
                                     INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.transcript_id = Transcript.id \
                                     WHERE Transcript.transcript_id != '" + Constants.UNKNOWN_TRANSCRIPT + "'"
    transcript_info_df = pd.read_sql( transcript_info_sql_statement,
                                      SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()
    # Merge the information from the two data frames in order to get
    # a data frame with the following columns:
    # - id: The ORFTranscriptAsso unique ID
    # - orf_id: The ORF unique ID
    # - chromosome: The ORF chromosome name
    # - start_pos: The ORF start position
    # - end_pos: The ORF stop position
    # - transcript_id: The Transcript unique ID
    # - tr_id: The transcript official ID (e.g. Ensembl ID)
    ota_info_df = orf_info_df.merge( transcript_info_df,
                                     on = 'id',
                                     how = 'inner',
                                     validate = 'one_to_one' )
    Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_ota_relative_coordinates(): ' +
                                 str( ota_info_df.shape[ 0 ] ) + ' ORFTranscriptAsso entries are' +
                                 ' expected to be processed.' )
    # As the conversion of coordinates in R may be highly time-consuming,
    # split the data frame into small data frames and multi-process the
    # computation.
    # Split the data frame into smaller data frames that can be processed
    # independently from each other
    subset_data_frames = [ ota_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                           for min_bound in xrange( 0,
                                                    ota_info_df.shape[ 0 ],
                                                    Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
    # For each of the subset data frames, process it with R in order
    # to build a dataset containing the start and stop relative
    # coordinates.
    # Instantiate the list of tuple-embedded arguments necessary to
    # compute the relative coordinates
    args_to_run_r = []
    filename_prefix = self.OTA_CSV_FILE_PREFIX
    filename_suffix = 0
    for df in subset_data_frames:
        args_to_run_r.append( ( df,
                                self.species,
                                self.ensembl_release_version,
                                filename_prefix,
                                filename_suffix ) )
        filename_suffix += 1
    # Instantiate the pool of processes
    p = Pool( self.thread_nb )
    messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
    p.close()
    # Wait for all the processes to be completed
    p.join()
    # Log the messages generated by the processes
    for messages in messages_to_log:
        ( debug_messages_to_log, stdout, stderr ) = messages
        for message in debug_messages_to_log:
            Logger.get_instance().debug( message )
        if ( stdout != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following standard output: \n' +
                                         stdout )
        # NB: As the R function is susceptible to write non-error-related
        #     messages in stderr, these messages are also logged at the
        #     debug level
        if ( stderr != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following error output: \n' +
                                         stderr )
    # Sequentially open the CSV files to get the relative positions.
    # Instantiate a dictionary that associates to each ORFTranscriptAsso ID
    # the relative start and stop positions of the ORF
    rel_positions_dict = {}
    for file_nb in range( filename_suffix ):
        df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                        filename_prefix + str( file_nb ) + '.csv' ),
                          sep = ',',
                          encoding = 'utf-8' )
        for ( index, row ) in df.iterrows():
            rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
    # Add the relative start and stop positions for all the ORFTranscriptAsso entries
    all_ota = SQLManagerPRO.get_instance().get_session().query( ORFTranscriptAsso ).filter(
                    ORFTranscriptAsso.id.in_( rel_positions_dict.keys() ) ).all()
    for ota in all_ota:
        # Get the start and stop positions
        positions = rel_positions_dict.get( ota.id )
        rel_start_pos = positions[ 0 ]
        rel_stop_pos = positions[ 1 ]
        if not pd.isna( rel_start_pos ):
            ota.rel_start_pos = int( rel_start_pos )
        if not pd.isna( rel_stop_pos ):
            ota.rel_stop_pos = int( rel_stop_pos )
    # Commit the updates and close the session
    SQLManagerPRO.get_instance().commit()
    SQLManagerPRO.get_instance().close_session()
    # Delete the pool instance
    p.clear()
def execute( self ):
    Logger.get_instance().info( 'IMPORTANT: This strategy has been built in order to be able to' +
                                ' convert exclusively coordinates related to a Transcript entry that' +
                                ' has an Ensembl transcript ID as "transcript_id" attribute.' +
                                ' Hence, if the database contains IDs related to another database,' +
                                ' then the source code of this strategy has to be modified in order to' +
                                ' convert these IDs into Ensembl IDs.' )
    # Run DatabaseCheck in order to check that the PRO database is reachable
    # and uses the appropriate model prior to computing the missing information.
    Logger.get_instance().info( 'Checking the PRO database prior to computing missing information...' )
    try:
        DatabaseCheckStrategy().execute()
    except Exception as e:
        raise DenCellORFException( 'An error occurred whilst checking the database prior to' +
                                   ' computing missing information.' +
                                   '\n Error code: ' + LogCodes.ERR_DBCHECK + '.', e )
    # Get the name of the species used in the database
    self.species = DataManager.get_instance().get_data( Constants.SPECIES_SHORT )
    # Get the Ensembl release version used in the database
    prometadata_ensembl_release = SQLManagerPRO.get_instance().get_session().query(
                                        PROMetadata ).filter(
                                            PROMetadata.parameter == Constants.METATABLE_CURRENT_ENSEMBL_RELEASE ).one()
    self.ensembl_release_version = prometadata_ensembl_release.value
    # Check there is at least one ORFTranscriptAsso entry in the database prior
    # to trying to convert the absolute coordinates into relative coordinates.
    # NB: The presence of entries in the Transcript table will obviously be
    #     implicitly checked at the same time.
    orftranscriptasso_count = SQLManagerPRO.get_instance().get_session().query( ORFTranscriptAsso ).count()
    if ( orftranscriptasso_count == 0 ):
        raise DenCellORFException( 'There is no entry in the ORFTranscriptAsso table of the ' +
                                   SQLManagerPRO.get_instance().db_name + ' database (PRO database).' +
                                   ' Hence, the conversion of absolute coordinates into relative' +
                                   ' coordinates will be stopped.' )
    SQLManagerPRO.get_instance().close_session()
    # Set the R_LIBS_USER environment variable to install new R
    # packages in a folder where the user has write permission
    if ( not os.path.exists( Constants.CUSTOM_R_LIBRARY_FOLDER ) ):
        os.makedirs( Constants.CUSTOM_R_LIBRARY_FOLDER )
    os.environ[ 'R_LIBS_USER' ] = Constants.CUSTOM_R_LIBRARY_FOLDER
    # As the computation of relative coordinates is performed
    # using R scripts relying on the ensembldb package and
    # annotation packages, first make sure the appropriate
    # annotation package is available. If not, build it.
    Logger.get_instance().debug( 'ComputeRelCoordStrategy.execute(): Preparing the R annotation' +
                                 ' package to perform the computation of relative coordinates' +
                                 ' (ensembl release: ' + str( self.ensembl_release_version ) + ')...' )
    self.prepare_r_annotation_package( species_short_name = self.species,
                                       species_full_name = Constants.SPECIES_CATALOG_FULL_NAMES_WITH_CAPS[ self.species ],
                                       species_common_name = Constants.SPECIES_CATALOG_COMMON_NAMES[ self.species ],
                                       ensembl_release_version = self.ensembl_release_version )
    # Create a new folder that will be used to create the temporary
    # CSV files necessary for the computation of relative coordinates
    if ( not os.path.exists( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER ) ):
        os.makedirs( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER )
    # ================================================================================
    # INFORMATION ABOUT THE MULTI-PROCESSING
    #
    # In order to reduce the computation time as much as possible, the computation
    # of relative coordinates is multi-processed (concurrent R script subprocesses
    # run in parallel).
    #
    # Important information regarding the multi-processing:
    # - Multi-processing has been chosen instead of multi-threading, in particular
    #   to side-step the GIL (Global Interpreter Lock).
    # - The processes use all the available / provided CPUs to run.
    # - The pathos package has been chosen as it allows serializing functions that
    #   are not top-level, such as class static methods (contrary to the
    #   multiprocessing built-in package for instance).
    # - The processes are run in pools, which is one of the most convenient means
    #   to parallelize the execution of a function across multiple inputs. The
    #   Pool map() method is used to do so.
    # - As accessing objects shared by the several processes (using locks and
    #   semaphores for instance) could slow down the execution a lot when the
    #   processes regularly need to access these variables, it has been decided
    #   not to access shared resources. Hence, the progression bar is not
    #   displayed on screen for this step.
    # - In order to use the Pool map() method efficiently, the arguments needed
    #   by the forked function are embedded into tuples of fixed size.
    #
    # ================================================================================
    # Compute the start and stop relative coordinates
    # in the ORFTranscriptAsso table
    self.compute_ota_relative_coordinates()
    # Compute the CDS start and stop relative coordinates
    # in the Transcript table
    self.compute_tr_cds_relative_coordinates()
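# --------------------------------------------------------------------------
# Minimal standalone sketch (hypothetical) of the chunk-and-pool pattern
# described in the comment block above, using the pathos ProcessingPool
# (imported as Pool, as in this module). The chunk size, worker function
# and data frame below are made up for illustration.
#
#   from pathos.multiprocessing import ProcessingPool as Pool
#
#   def process_chunk( args ):
#       # Unpack the fixed-size tuple of arguments
#       ( df_chunk, chunk_nb ) = args
#       return ( chunk_nb, df_chunk.shape[ 0 ] )
#
#   chunk_size = 1000
#   chunks = [ big_df[ i : i + chunk_size ]
#              for i in xrange( 0, big_df.shape[ 0 ], chunk_size ) ]
#   args_list = [ ( chunk, nb ) for ( nb, chunk ) in enumerate( chunks ) ]
#   p = Pool( 4 )
#   results = p.map( process_chunk, args_list )
#   p.close()
#   p.join()
#   p.clear()
# --------------------------------------------------------------------------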
def execute(self):
    # Create a session to the "PRO-like" database
    SQLManagerPRO.get_instance().set_db_settings(self.db_settings)
    try:
        SQLManagerPRO.get_instance().get_session()
    except Exception as e:
        raise DenCellORFException('GenerateFastaFileStrategy.execute(): An error occurred trying to' +
                                  ' create a session to the database.' +
                                  '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
    SQLManagerPRO.get_instance().close_session()
    Logger.get_instance().info('Starting to build the FASTA file.')
    Logger.get_instance().info('The fasta file will be created querying the ' + self.table_type +
                               ' table and using the ' + self.seq_type + ' sequences.')
    # Create the output folder if it does not yet exist
    # (and its parent folders if necessary)
    if (not os.path.isdir(self.output_folder)):
        os.makedirs(self.output_folder)
    file_path = os.path.join(self.output_folder, self.filename) + self.FASTA_FILE_EXTENSION
    # Get the name of the species
    sp = SQLManagerPRO.get_instance().get_session().query(PROSpeciesCatalog).one().name
    SQLManagerPRO.get_instance().close_session()
    # Get the information related to the species
    # NB: This information will be used in the headers
    taxon_sc_name = Constants.SPECIES_CATALOG_FULL_NAMES_WITH_CAPS[sp]
    taxon_code = Constants.SPECIES_CATALOG_CODE[sp]
    taxon_id = Constants.SPECIES_CATALOG_TAXON_ID[sp]
    # Get the version number of the database
    db_release = SQLManagerPRO.get_instance().get_session().query(PROMetadata.value).filter(
                        PROMetadata.parameter == Constants.METATABLE_DATABASE_VERSION_NUMBER).all()
    db_release = GeneralUtil.query_result_to_list(db_release)
    if (len(db_release) == 1):
        db_release = db_release[0]
    else:
        db_release = ''
    # Create the FASTA file
    # ---------------------
    # Get the information necessary to compute the file content
    if (self.table_type == self.ORF_TABLE):
        # Get the necessary information from the ORF table
        all_orfs_query = SQLManagerPRO.get_instance().get_session().query(
                                ORF.id,
                                ORF.chromosome,
                                ORF.strand,
                                ORF.start_pos,
                                ORF.stop_pos,
                                ORF.spliced_parts_count,
                                eval('ORF.' + self.seq_attribute_name)).filter(
                                    eval('ORF.' + self.seq_attribute_name) != None)
    else:
        # Get the necessary information from the ORFTranscriptAsso table
        all_orfs_query = SQLManagerPRO.get_instance().get_session().query(
                                ORFTranscriptAsso.id,
                                ORFTranscriptAsso.orf_id,
                                ORFTranscriptAsso.transcript_id,
                                ORFTranscriptAsso.rel_start_pos,
                                ORFTranscriptAsso.rel_stop_pos,
                                eval('ORFTranscriptAsso.' + self.seq_attribute_name)).filter(
                                    eval('ORFTranscriptAsso.' + self.seq_attribute_name) != None)
    # Run the query and get the results as a Pandas data frame
    all_orfs_df = pd.read_sql(all_orfs_query.statement,
                              SQLManagerPRO.get_instance().get_engine())
    SQLManagerPRO.get_instance().close_session()
    # Check the query returned a result
    total_sqce_count = all_orfs_df.shape[0]
    if (total_sqce_count == 0):
        raise DenCellORFException('It seems that the database you are querying does not contain any' +
                                  ' entry with a sequence (' + self.seq_type + ') in its ' +
                                  self.table_type + ' table. Hence, the generation of the fasta file' +
                                  ' has been stopped.')
    all_orfs_df = all_orfs_df.astype(str)
    # If the excludeSqcesWithStop option has been selected,
    # then exclude from the data frame all the sequences
    # that contain at least one stop codon
    if self.exclude_sqce_with_stops:
        contains_stop_codon = all_orfs_df.apply(self.check_stop_codons_in_sqce,
                                                seq_type=self.seq_type,
                                                seq_attribute_name=self.seq_attribute_name,
                                                axis=1).to_frame()
        contains_stop_codon = contains_stop_codon.rename(columns={0: 'contains_stop_codon'})
        all_orfs_df = pd.concat([all_orfs_df, contains_stop_codon], axis=1)
        # Keep in the data frame only the ORFs for which the sequence
        # does not contain stop codons
        all_orfs_df = all_orfs_df[all_orfs_df.contains_stop_codon == False]
        Logger.get_instance().info(str(total_sqce_count - all_orfs_df.shape[0]) +
                                   ' sequences (/' + str(total_sqce_count) +
                                   ') have been removed as they contained stop codons.')
    # For each row, build a string that will be used
    # as header line in the FASTA file
    header = all_orfs_df.apply(self.generate_header,
                               axis=1,
                               taxon_sc_name=taxon_sc_name,
                               taxon_code=taxon_code,
                               taxon_id=str(taxon_id),
                               table=self.table_type,
                               db_release=db_release,
                               long_header=self.long_header).to_frame()
    header = header.rename(columns={0: 'header'})
    all_orfs_df = pd.concat([all_orfs_df, header], axis=1)
    # Write the FASTA file one entry at a time
    with open(file_path, 'w') as fasta_file:
        for (index, row) in all_orfs_df.iterrows():
            # Write the header line
            fasta_file.write('>' + row['header'] + '\n')
            # Split the sequence if it has to be written on several lines
            full_seq = row[self.seq_attribute_name]
            seq = '\n'.join([full_seq[k:k + self.MAX_SEQ_LINE_LENGTH]
                             for k in range(0, len(full_seq), self.MAX_SEQ_LINE_LENGTH)])
            # Write the sequence line(s)
            fasta_file.write(seq + '\n')
    Logger.get_instance().info('The fasta file has been created at ' + file_path + '.')
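# --------------------------------------------------------------------------
# Worked example (illustrative): the slicing expression above wraps a
# sequence at MAX_SEQ_LINE_LENGTH characters per line. With a made-up line
# length of 10, a 25-character sequence yields three lines.
#
#   full_seq = 'ATGGCCATTGTAATGGGCCGCTGAA'
#   MAX_SEQ_LINE_LENGTH = 10
#   seq = '\n'.join( [ full_seq[ k : k + MAX_SEQ_LINE_LENGTH ]
#                      for k in range( 0, len( full_seq ), MAX_SEQ_LINE_LENGTH ) ] )
#   # seq == 'ATGGCCATTG\nTAATGGGCCG\nCTGAA'
# --------------------------------------------------------------------------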
def execute(self):
    # Set the connection to the database
    self.get_sqlmanager_instance().set_db_settings(self.db_settings)
    try:
        self.get_sqlmanager_instance().get_session()
        self.get_sqlmanager_instance().close_session()
    except DenCellORFException as e:
        raise DenCellORFException('RestoreStrategy.execute(): An error occurred while trying to' +
                                  ' create a session to the database.' +
                                  '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
    # Check if the database already exists.
    # If it exists, then ask the user to confirm the deletion of the database.
    if ((not self.force_overwrite) and (self.get_sqlmanager_instance().db_exists())):
        confirm_deletion = None
        Logger.get_instance().info('A database already exists at the provided connection settings.' +
                                   ' Hence, any existing data needs to be removed prior to the' +
                                   ' insertion of the data to restore.')
        while (confirm_deletion not in ['Y', 'N']):
            print('Do you want to confirm the deletion of the database? (Y/N)')
            confirm_deletion = raw_input().upper()
        if (confirm_deletion == 'N'):
            Logger.get_instance().critical('As a database already exists at the provided connection' +
                                           ' settings and as the deletion of existing data has been' +
                                           ' canceled by the user, the program will be stopped.' +
                                           ' Please see the documentation for more information.')
    # (Re-)create the empty database
    self.get_sqlmanager_instance().build_database(db_settings=self.db_settings,
                                                  species=None,
                                                  sp_mandatory=False,
                                                  force_overwrite=True)
    # Get the appropriate order in which the tables need to be filled in
    order_of_insertion = eval('self.' + self.db_model + '_ORDER_OF_INSERTION')
    # For each table of the list, get the corresponding file,
    # upload the content and insert the data in the database
    for tablename in order_of_insertion:
        Logger.get_instance().debug('Starting to load and insert the data saved from the table ' +
                                    tablename + '.')
        # Get the name of the file (without its extension)
        if self.file_prefix:
            filename = self.file_prefix + tablename
        else:
            filename = tablename
        # Get the content of the file
        try:
            objects_to_insert = FileHandlerUtil.get_obj_from_file(input_folder=self.input_folder,
                                                                  filename=filename)
        except Exception as e:
            raise DenCellORFException('An error occurred trying to import the objects to insert' +
                                      ' in the ' + tablename + ' table.', e)
        Logger.get_instance().debug(str(len(objects_to_insert)) + ' entries are expected' +
                                    ' to be inserted into the ' + tablename + ' table.')
        # Insert the data
        # NB: Using the add_all() method of the session does not work (probably because
        #     the objects saved in the file were mapped to the session). Hence, it is
        #     necessary to add the objects one at a time using the merge method.
        # Get the total number of elements expected to be treated and
        # reset the ProgressionBar instance to follow the progression
        ProgressionBar.get_instance().reset_instance(total=len(objects_to_insert))
        for entry in objects_to_insert:
            # Update and display the progression bar on the console
            ProgressionBar.get_instance().increase_and_display()
            try:
                self.get_sqlmanager_instance().get_session().merge(entry)
            except Exception as e:
                raise DenCellORFException('An error occurred trying to insert the data into the ' +
                                          tablename + ' table. Please make sure the backup occurred' +
                                          ' successfully.', e)
        # Commit the session
        try:
            self.get_sqlmanager_instance().commit()
        except Exception as e:
            raise DenCellORFException('An error occurred trying to commit changes after the insertion' +
                                      ' of data in the ' + tablename + ' table.' +
                                      '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
        entry_count = self.get_sqlmanager_instance().get_session().query(eval(tablename)).count()
        Logger.get_instance().debug(str(entry_count) + ' entries have been successfully added' +
                                    ' to the ' + tablename + ' table.')
        self.get_sqlmanager_instance().close_session()
    # Log the end of the restoration
    Logger.get_instance().info('Restoration of the database has finished.')