def populate_combobox_experiment_id(self):
    '''
    Fill "combobox_experiment_id" with the experiment identifications
    listed in the result directory of the cluster.
    '''

    # reset the value currently selected in the combobox
    self.wrapper_experiment_id.set('')

    # list the content of the cluster result directory through the SSH client
    command = 'ls {0}'.format(xlib.get_cluster_result_dir())
    (OK, stdout, stderr) = xssh.execute_cluster_command(self.ssh_client, command)

    # every output line, except "lost+found", is taken as an experiment identification;
    # the list stays empty when the command did not end OK
    experiment_id_list = []
    if OK:
        entry_names = (raw_line.rstrip('\n') for raw_line in stdout)
        experiment_id_list = [name for name in entry_names if name != 'lost+found']

    # warn the user and leave the combobox values untouched when nothing was found
    if not experiment_id_list:
        title = '{0} - {1}'.format(xlib.get_project_name(), self.head)
        tkinter.messagebox.showwarning(title, 'The cluster has not experiment data.')
        return

    # load the experiment identifications in the combobox
    self.combobox_experiment_id['values'] = experiment_id_list
def combobox_cluster_name_selected_item(self, event=None):
    '''
    Handle the event raised when an item of "combobox_cluster_name" is selected.
    '''

    # show the busy cursor while the selection is processed
    self.main.config(cursor='watch')
    self.main.update()

    # a new SSH connection is only needed when the selected cluster has changed
    if self.wrapper_cluster_name.get() != self.cluster_name_ant:

        # release the connection with the previous cluster, if there was one
        if self.cluster_name_ant is not None:
            xssh.close_ssh_client_connection(self.ssh_client)

        # connect to the master node of the selected cluster
        (OK, error_list, self.ssh_client) = xssh.create_ssh_client_connection(self.wrapper_cluster_name.get(), 'master')
        if not OK:
            # show all the errors, one per line, and close the form
            message = ''.join('{0}\n'.format(error) for error in error_list)
            title = '{0} - {1}'.format(xlib.get_project_name(), self.head)
            tkinter.messagebox.showerror(title, message)
            # NOTE(review): execution continues after close(), as in the original code -- confirm it is intended
            self.close()

        # remember the selected cluster as the previous one for the next event
        self.cluster_name_ant = self.wrapper_cluster_name.get()

    # refresh the data of "combobox_experiment_id"
    self.populate_combobox_experiment_id()

    # restore the normal cursor
    self.main.config(cursor='')
    self.main.update()
def print_headers_without_environment(process_name):
    '''
    Print the headers of a screen without the environment information.
    '''

    # the title holds the project name, the project version and the process name
    title = '{0} v {1} - {2}'.format(xlib.get_project_name(), xlib.get_project_version(), process_name)

    # the title is boxed between two borders as wide as the title itself
    border = '+-{0}-+'.format('-' * len(title))

    # print the box followed by an empty line
    print(border, '| {0} |'.format(title), border, '', sep='\n')
def populate_combobox_cluster_name(self):
    '''
    Fill "combobox_cluster_name" with the names of the running clusters.
    '''

    # reset the value currently selected in the combobox
    self.wrapper_cluster_name.set('')

    # get the running clusters, the volume creator is left out
    running_cluster_list = xec2.get_running_cluster_list(volume_creator_included=False)

    if running_cluster_list == []:
        # there is nothing to load: warn the user and leave the combobox values untouched
        title = '{0} - {1}'.format(xlib.get_project_name(), self.head)
        tkinter.messagebox.showwarning(title, 'There is not any running cluster.')
    else:
        # load the names of the running clusters in the combobox
        self.combobox_cluster_name['values'] = running_cluster_list
def build_parser():
    '''
    Create the command line parser of the program and declare its arguments.
    '''

    # the module xlib is imported when the function is called
    import xlib

    # name of this program file, shown in the heading and in the usage line
    program_file = os.path.basename(__file__)

    # heading text: project name, version, program file and a short description
    description = 'Description: This program start NGScloud2 both console mode and gui mode.'
    text = '{0} v{1} - {2}\n\n{3}\n'.format(xlib.get_project_name(), xlib.get_project_version(), program_file, description)

    # custom usage message; the leading carriage return presumably hides the
    # "usage: " prefix written by argparse -- TODO confirm
    usage = '\r{0}\nUsage: {1} arguments'.format(text.ljust(len('usage:')), program_file)

    # create the parser, rename its optional group and add the arguments
    parser = argparse.ArgumentParser(usage=usage)
    parser._optionals.title = 'Arguments'
    parser.add_argument('--mode', dest='mode', help='Mode: console or gui')

    # return the parser
    return parser
def print_headers_with_environment(process_name):
    '''
    Print the headers of a screen followed by the environment information.
    '''

    # the title holds the project name, the project version and the process name
    title = '{0} v {1} - {2}'.format(xlib.get_project_name(), xlib.get_project_version(), process_name)

    # print the title boxed between two borders and an empty line
    border = '+-{0}-+'.format('-' * len(title))
    print(border, '| {0} |'.format(title), border, '', sep='\n')

    # get the names of the current region and zone
    region_name = xconfiguration.get_current_region_name()
    zone_name = xconfiguration.get_current_zone_name()

    # print the environment with the current region and zone, and an empty line
    environment_line = 'Environment: {0} - Region: {1} - Zone: {2}'.format(xconfiguration.environment, region_name, zone_name)
    print(environment_line, '', sep='\n')
def form_create_ngscloud_config_file(is_menu_call):
    '''
    Create the NGScloud config file corresponding to the environment.

    The user confirms or changes the AWS data and the contact e-mail address,
    the AWS credentials are verified and, if they are right (and the user
    confirms the action when the function is called from a menu), the config
    file is created.

    Parameters:
        is_menu_call: True when the function is called from a menu; in that
            case the screen header is printed and a confirmation is requested
            before the file is created.

    Raises:
        xlib.ProgramException('EXIT'): the credentials are wrong and the
            function was not called from a menu.
        xlib.ProgramException('C001'): the config file could not be created.
    '''

    # print the header; the project name comes from xlib, as in the other messages of this form
    if is_menu_call:
        clib.clear_screen()
        clib.print_headers_with_environment('Configuration - Recreate {0} config file'.format(xlib.get_project_name()))

    # get current region and zone names
    # NOTE(review): these values are not used below; the calls are kept in case they have side effects -- confirm
    region_name = xconfiguration.get_current_region_name()
    zone_name = xconfiguration.get_current_zone_name()

    # get basic AWS data and contact e-mail address from NGScloud config file
    (user_id, access_key_id, secret_access_key) = xconfiguration.get_basic_aws_data()
    email = xconfiguration.get_contact_data()

    # confirm or change the AWS data and contact e-mail address
    print(xlib.get_separator())
    user_id = cinputs.input_user_id(user_id)
    access_key_id = cinputs.input_access_key_id(access_key_id)
    secret_access_key = cinputs.input_secret_access_key(secret_access_key)
    email = cinputs.input_email(email)

    # verify the AWS access key identification and the AWS secret access key
    print(xlib.get_separator())
    print('Verifying the AWS access key identification and the AWS secret access key')
    OK = xec2.verify_aws_credentials(access_key_id, secret_access_key)
    if OK:
        print('The credentials are OK.')
    else:
        print('ERROR: The credentials are wrong. Please review your access key identification and secret access key in the AWS web.')
        # outside a menu there is no way to go on without valid credentials
        if not is_menu_call:
            raise xlib.ProgramException('EXIT')

    # confirm the creation of the NGScloud config file (only when called from a menu)
    if OK and is_menu_call:
        print(xlib.get_separator())
        OK = clib.confirm_action('The {0} config file is going to be created. The previous files will be lost.'.format(xlib.get_project_name()))

    # create the NGScloud config file corresponding to the environment
    if OK:
        print(xlib.get_separator())
        print('The file {0} is being created ...'.format(xconfiguration.get_ngscloud_config_file()))
        (OK, error_list) = xconfiguration.create_ngscloud_config_file(user_id, access_key_id, secret_access_key, email)
        if OK:
            print('The config file is created with default values.')
            print()
            print('You can modify the conection data and contact e-mail address in:')
            print(' "Cloud control" -> "Configuration" -> "Update connection data and contact e-mail"')
            print('The assigned region and zone are {0} and {1}, respectively. You can modify them in:'.format(xconfiguration.get_default_region_name(), xconfiguration.get_default_zone_name()))
            print(' "Cloud control" -> "Configuration" -> "Update region and zone data"')
        else:
            for error in error_list:
                print(error)
            raise xlib.ProgramException('C001')

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def build_gmap_process_script(cluster_name, current_run_dir):
    '''
    Build the current GMAP process script.

    The options are read from the GMAP config file. All the lines of the Bash
    script are prepared in memory and validated first, and only then written
    to the file, so a wrong option never leaves a half-written script.

    Parameters:
        cluster_name: name of the cluster, written in the messages of the script.
        current_run_dir: run directory of the cluster where the script works.

    Returns:
        A tuple (OK, error_list): OK is False when the script could not be
        built and error_list holds the error messages.

    Raises:
        KeyError: a section or an option is missing in the option dictionary.
    '''

    # initialize the error list
    error_list = []

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the options
    identification = gmap_option_dict['identification']
    gmap_parameters = gmap_option_dict['GMAP parameters']
    experiment_id = identification['experiment_id']
    reference_dataset_id = identification['reference_dataset_id']
    reference_file = identification['reference_file']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    threads = gmap_parameters['threads']
    kmer = gmap_parameters['kmer']
    sampling = gmap_parameters['sampling']
    input_buffer_size = gmap_parameters['input-buffer-size']
    output_buffer_size = gmap_parameters['output-buffer-size']
    prunelevel = gmap_parameters['prunelevel']
    output_format = gmap_parameters['format']
    other_parameters = gmap_parameters['other_parameters']

    # set the cluster reference dataset directory
    cluster_reference_dataset_dir = xlib.get_cluster_reference_dataset_dir(reference_dataset_id)

    # set the cluster reference file
    cluster_reference_file = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the GMAP database name (reference file name without its extension)
    reference_file_name = os.path.splitext(reference_file)[0]
    gmap_database = '{0}-gmap_database'.format(reference_file_name)

    # set the transcriptome file path; it stays None when the assembly software
    # or the assembly type are not among the known ones
    result_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(result_dataset_dir, experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(result_dataset_dir, experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(result_dataset_dir)
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(result_dataset_dir)
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(result_dataset_dir)
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(result_dataset_dir)
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(result_dataset_dir)
    if transcriptome_file is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined with the assembly software {0} and the assembly type {1}.'.format(assembly_software, assembly_type))
        return (False, error_list)

    # build the lines corresponding to the other parameters ("--name=value" or "--name", separated by ";")
    other_parameter_lines = []
    if other_parameters.upper() != 'NONE':
        for parameter in (item.strip() for item in other_parameters.split(';')):
            if parameter.find('=') > 0:
                mo = re.search(r'^--(.+)=(.+)$', parameter)
                if mo is not None:
                    other_parameter_lines.append(' --{0}={1} \\'.format(mo.group(1).strip(), mo.group(2).strip()))
            else:
                mo = re.search(r'^--(.+)$', parameter)
                if mo is not None:
                    other_parameter_lines.append(' --{0} \\'.format(mo.group(1).strip()))
            if mo is None:
                error_list.append('*** ERROR: The value "{0}" found in other_parameters is not like --name or --name=value.'.format(parameter))
                return (False, error_list)

    # set the output file path
    output_file = 'gmap_output_{0}.txt'.format(output_format.lower())

    # set the line corresponding to the output format: three formats have their own flag
    if output_format.upper() == 'COMPRESS':
        format_line = ' --compress \\'
    elif output_format.upper() == 'SUMMARY':
        format_line = ' --summary \\'
    elif output_format.upper() == 'ALIGN':
        format_line = ' --align \\'
    else:
        format_line = ' --format={0} \\'.format(output_format.lower())

    # set the lines shared by several functions of the script
    rule = '#-------------------------------------------------------------------------------'
    time_format_line = ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
    kmer_lines = [' --kmer={0} \\'.format(kmer)] if kmer.upper() != 'NONE' else []
    sampling_lines = [' --sampling={0} \\'.format(sampling)] if sampling.upper() != 'NONE' else []
    end_datetime_lines = [
        ' END_DATETIME=`date --utc +%s`',
        ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
        ' calculate_duration',
        ' echo "$SEP"',
    ]
    recipient_line = ' RECIPIENT={0}'.format(xconfiguration.get_contact_data())
    subject_line = ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_gmap_name())
    message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
    mail_line = ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

    # build the lines of the GMAP process script
    script_lines = []

    # -- environment
    script_lines += [
        '#!/bin/bash',
        rule,
        'GMAP_GSNAP_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_gmap_gsnap_bioconda_code()),
        'PATH=$GMAP_GSNAP_PATH:$PATH',
        'SEP="#########################################"',
        'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
        'source activate {0}'.format(xlib.get_gmap_gsnap_bioconda_code()),
        rule,
    ]

    # -- function init
    script_lines += [
        'function init',
        '{',
        ' INIT_DATETIME=`date --utc +%s`',
        ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
        ' echo "$SEP"',
        ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
        '}',
        rule,
    ]

    # -- function build_gmap_database
    script_lines += [
        'function build_gmap_database',
        '{',
        ' cd {0}'.format(current_run_dir),
        ' echo "$SEP"',
        ' /usr/bin/time \\',
        time_format_line,
        ' gmap_build \\',
        ' --dir={0}\\'.format(cluster_reference_dataset_dir),
        ' --db={0}\\'.format(gmap_database),
    ]
    script_lines += kmer_lines
    script_lines += [
        ' {0}'.format(cluster_reference_file),
        '}',
        rule,
    ]

    # -- function run_gmap_process
    script_lines += [
        'function run_gmap_process',
        '{',
        ' cd {0}'.format(current_run_dir),
        ' echo "$SEP"',
        ' gmap --version',
        ' echo "$SEP"',
        ' /usr/bin/time \\',
        time_format_line,
        ' gmap \\',
        ' --nthreads={0} \\'.format(threads),
        ' --dir={0} \\'.format(cluster_reference_dataset_dir),
        ' --db={0} \\'.format(gmap_database),
    ]
    script_lines += kmer_lines
    script_lines += sampling_lines
    script_lines += [
        ' --input-buffer-size={0} \\'.format(input_buffer_size),
        ' --output-buffer-size={0} \\'.format(output_buffer_size),
        ' --prunelevel={0} \\'.format(prunelevel),
        format_line,
        ' --ordered \\',
        ' --nofails \\',
    ]
    script_lines += other_parameter_lines
    script_lines += [
        ' {0} \\'.format(transcriptome_file),
        ' > {0}'.format(output_file),
        ' RC=$?',
        ' if [ $RC -ne 0 ]; then manage_error gmap $RC; fi',
        '}',
        rule,
    ]

    # -- function end
    script_lines += ['function end', '{']
    script_lines += end_datetime_lines
    script_lines += [
        ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
        ' echo "$SEP"',
        recipient_line,
        subject_line,
        message_template.format(xlib.get_gmap_name(), cluster_name, 'OK'),
        mail_line,
        ' exit 0',
        '}',
        rule,
    ]

    # -- function manage_error
    script_lines += ['function manage_error', '{']
    script_lines += end_datetime_lines
    script_lines += [
        ' echo "ERROR: $1 returned error $2"',
        ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
        ' echo "$SEP"',
        recipient_line,
        subject_line,
        message_template.format(xlib.get_gmap_name(), cluster_name, 'WRONG'),
        mail_line,
        ' exit 3',
        '}',
        rule,
    ]

    # -- function calculate_duration
    script_lines += [
        'function calculate_duration',
        '{',
        ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
        ' HH=`expr $DURATION / 3600`',
        ' MM=`expr $DURATION % 3600 / 60`',
        ' SS=`expr $DURATION % 60`',
        ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
        '}',
        rule,
    ]

    # -- main sequence
    script_lines += ['init', 'build_gmap_database', 'run_gmap_process', 'end']

    # get the GMAP process script name
    gmap_process_script = get_gmap_process_script()

    # write the GMAP process script; only the file system errors are reported as creation errors
    OK = True
    try:
        script_dir = os.path.dirname(gmap_process_script)
        if script_dir:
            os.makedirs(script_dir, exist_ok=True)
        with open(gmap_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(line) for line in script_lines)
    except OSError as error:
        error_list.append('*** ERROR: The file {0} can not be created'.format(gmap_process_script))
        error_list.append('*** {0}'.format(error))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def create_cluster(template_name, cluster_name, log, function=None, is_menu_call=True):
    '''
    Create a cluster from a template name.

    The process requirements are checked first; then the cluster is started
    with StarCluster and, unless it is the volume creator, the infrastructure
    software is installed in every node of the cluster.

    Parameters:
        template_name: name of the cluster template.
        cluster_name: name of the cluster to be created.
        log: object with a "write" method where the progress is written.
        function: optional callable without arguments executed at the end.
        is_menu_call: True when the function is called from a menu.

    Returns:
        A tuple (OK, master_state_code, master_state_name); both state values
        are empty strings when the cluster was not started.
    '''

    # initialize the control variable
    OK = True

    # initialize the state variables
    master_state_code = ''
    master_state_name = ''

    # get current region and zone names
    region_name = xconfiguration.get_current_region_name()
    zone_name = xconfiguration.get_current_zone_name()

    # get the name reserved to the volume creator
    volume_creator_name = xlib.get_volume_creator_name()

    # the warnings about the log window are only written when the log is not
    # a xlib.DevStdOut instance and the function is called from a menu
    is_window_log = not isinstance(log, xlib.DevStdOut) and is_menu_call

    # warn that the log window does not have to be closed
    if is_window_log:
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # warn that the requirements are being verified
    log.write(f'{xlib.get_separator()}\n')
    log.write('Checking process requirements ...\n')

    # check the requirements in order; the first one that fails ends the checking
    if not xconfiguration.is_template_defined(template_name):
        # the template has to be defined in the NGScloud config file
        log.write(f'*** ERROR: The cluster {cluster_name} is not defined in the {xlib.get_project_name()} config file.\n')
        OK = False
    elif xec2.get_cluster_mode(cluster_name) is not None:
        # the cluster mode has to be None
        log.write('*** ERROR: There is a cluster or a instance running.\n')
        OK = False
    elif not xec2.is_zone_available(region_name, zone_name):
        # the zone has to be available
        log.write(f'*** ERROR: The zone name {zone_name} is not available.\n')
        OK = False
    else:
        log.write('Process requirements are OK.\n')

    # create the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        if cluster_name == volume_creator_name:
            log.write('Creating the volume creator using StarCluster ...\n')
        else:
            log.write(f'Creating the cluster {cluster_name} using StarCluster ...\n')
        log.write('\n')

        # the volume creator template is started with the queue disabled
        queue_option = '--disable-queue ' if template_name == volume_creator_name else ''
        command = f'{xlib.get_starcluster()} --region={region_name} start --availability-zone={zone_name} --cluster-template={template_name} {queue_option}{cluster_name}'
        rc = xlib.run_command(command, log)
        log.write('\n')

        if rc == 0:
            (master_state_code, master_state_name) = xec2.get_node_state(cluster_name, node_name='master')
            if cluster_name == volume_creator_name:
                log.write('The volume creator is created.\n')
            else:
                log.write('The cluster is created.\n')
        else:
            log.write(f'*** ERROR: Return code {rc} in command -> {command}\n')
            # every line of the log ends with a new line, also this empty one
            log.write('***\n')
            log.write(f'*** You have to terminate {cluster_name} (option "Force termination of a cluster")\n')
            log.write('*** and create it again.\n')
            OK = False

    # install infrastructure software in every node of the cluster
    if OK and cluster_name != volume_creator_name:
        for node_name in xec2.get_cluster_node_list(cluster_name):
            node_OK = xnode.install_node_infrastructure_software(cluster_name, node_name, log)
            # the installation goes on in the remaining nodes, but a failure in
            # any node is kept and not overwritten by the result of a later node
            if not node_OK:
                OK = False

    # warn that the log window can be closed
    if is_window_log:
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable and the state
    return (OK, master_state_code, master_state_name)
def build_fastqc_process_script(cluster_name, current_run_dir):
    '''
    Build the current FastQC process script.

    The options are read from the FastQC config file. All the lines of the
    Bash script are prepared in memory first and then written to the file, so
    only file system errors are reported as creation errors and no
    half-written script is left by other errors.

    Parameters:
        cluster_name: name of the cluster, written in the messages of the script.
        current_run_dir: run directory of the cluster where the script works.

    Returns:
        A tuple (OK, error_list): OK is False when the script file could not
        be created and error_list holds the error messages.

    Raises:
        KeyError: a section or an option is missing in the option dictionary.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the FastQC option dictionary
    fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())

    # get the options
    experiment_id = fastqc_option_dict['identification']['experiment_id']
    read_dataset_id = fastqc_option_dict['identification']['read_dataset_id']
    threads = fastqc_option_dict['FastQC parameters']['threads']

    # build the file name list from the sections whose identification is like file-n;
    # the sections are processed in the lexicographic order of their identifications
    file_name_list = [
        fastqc_option_dict[section]['file_name']
        for section in sorted(fastqc_option_dict.keys())
        if re.match('^file-[0-9]+$', section)
    ]

    # set the lines shared by several functions of the script
    rule = '#-------------------------------------------------------------------------------'
    time_format_line = ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
    end_datetime_lines = [
        ' END_DATETIME=`date --utc +%s`',
        ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
        ' calculate_duration',
        ' echo "$SEP"',
    ]
    recipient_line = ' RECIPIENT={0}'.format(xconfiguration.get_contact_data())
    subject_line = ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_fastqc_name())
    message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
    mail_line = ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

    # build the lines of the FastQC process script
    script_lines = []

    # -- environment
    script_lines += [
        '#!/bin/bash',
        rule,
        'FASTQC_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_fastqc_bioconda_code()),
        'PATH=$FASTQC_PATH:$PATH',
        'SEP="#########################################"',
        'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
        'source activate {0}'.format(xlib.get_fastqc_bioconda_code()),
        rule,
    ]

    # -- function init
    script_lines += [
        'function init',
        '{',
        ' INIT_DATETIME=`date --utc +%s`',
        ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
        ' echo "$SEP"',
        ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
        '}',
        rule,
    ]

    # -- function run_fastqc_process: one FastQC run, with its own return code check, per file
    script_lines += [
        'function run_fastqc_process',
        '{',
        ' cd {0}'.format(current_run_dir),
        ' echo "$SEP"',
        ' fastqc --version',
    ]
    for file_name in file_name_list:
        script_lines += [
            ' echo "$SEP"',
            ' /usr/bin/time \\',
            time_format_line,
            ' fastqc \\',
            ' {0} \\'.format(xlib.get_cluster_read_file(experiment_id, read_dataset_id, file_name)),
            ' --threads={0} \\'.format(threads),
            ' --outdir={0}'.format(current_run_dir),
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error fastqc $RC; fi',
        ]
    script_lines += ['}', rule]

    # -- function end
    script_lines += ['function end', '{']
    script_lines += end_datetime_lines
    script_lines += [
        ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
        ' echo "$SEP"',
        recipient_line,
        subject_line,
        message_template.format(xlib.get_fastqc_name(), cluster_name, 'OK'),
        mail_line,
        ' exit 0',
        '}',
        rule,
    ]

    # -- function manage_error
    script_lines += ['function manage_error', '{']
    script_lines += end_datetime_lines
    script_lines += [
        ' echo "ERROR: $1 returned error $2"',
        ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
        ' echo "$SEP"',
        recipient_line,
        subject_line,
        message_template.format(xlib.get_fastqc_name(), cluster_name, 'WRONG'),
        mail_line,
        ' exit 3',
        '}',
        rule,
    ]

    # -- function calculate_duration
    script_lines += [
        'function calculate_duration',
        '{',
        ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
        ' HH=`expr $DURATION / 3600`',
        ' MM=`expr $DURATION % 3600 / 60`',
        ' SS=`expr $DURATION % 60`',
        ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
        '}',
        rule,
    ]

    # -- main sequence
    script_lines += ['init', 'run_fastqc_process', 'end']

    # get the FastQC process script name
    fastqc_process_script = get_fastqc_process_script()

    # write the FastQC process script; only the file system errors are reported as creation errors
    try:
        script_dir = os.path.dirname(fastqc_process_script)
        if script_dir:
            os.makedirs(script_dir, exist_ok=True)
        with open(fastqc_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(line) for line in script_lines)
    except OSError as error:
        error_list.append('*** ERROR: The file {0} can not be created'.format(fastqc_process_script))
        error_list.append('*** {0}'.format(error))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the CD-HIT-EST config file and a Bash script is
    written to the path returned by get_cd_hit_est_process_script().

    Parameters:
        cluster_name: cluster name embedded in the log and mail texts.
        current_run_dir: run directory where the script changes to and where
            the clustered transcriptome is written.

    Returns:
        A tuple (OK, error_list). OK is False and error_list holds the error
        messages when the script can not be built.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    identification_dict = cd_hit_est_option_dict['identification']
    parameter_dict = cd_hit_est_option_dict['CD-HIT-EST parameters']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    threads = parameter_dict['threads']
    memory_limit = parameter_dict['memory_limit']
    seq_identity_threshold = parameter_dict['seq_identity_threshold']
    word_length = parameter_dict['word_length']
    mask = parameter_dict['mask']
    match = parameter_dict['match']
    mismatch = parameter_dict['mismatch']
    other_parameters = parameter_dict['other_parameters']

    # set the transcriptome file name; it stays None when the assembly software
    # or the assembly type is not a known one
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        # validate_cd_hit_est_config_file accepts the assembly type in any
        # case, so it is compared in upper case here too
        extension = {'CONTIGS': 'contig', 'SCAFFOLDS': 'scafSeq'}.get(assembly_type.upper())
        if extension is not None:
            transcriptome_file_name = '{0}-{1}.{2}'.format(experiment_id, assembly_dataset_id, extension)
    else:
        fixed_file_name_list = [
            (xlib.get_transabyss_code(), 'transabyss-final.fa'),
            (xlib.get_trinity_code(), 'Trinity.fasta'),
            (xlib.get_star_code(), 'Trinity-GG.fasta'),
            (xlib.get_cd_hit_est_code(), 'clustered-transcriptome.fasta'),
            (xlib.get_transcript_filter_code(), 'filtered-transcriptome.fasta'),
        ]
        for software_code, file_name in fixed_file_name_list:
            if assembly_software == software_code:
                transcriptome_file_name = file_name
                break

    # set the transcriptome file path
    if transcriptome_file_name is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined for the assembly software {0} and the assembly type {1}.'.format(assembly_software, assembly_type))
        OK = False
    else:
        transcriptome_file = '{0}/{1}'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), transcriptome_file_name)

    # set the output file path
    if OK:
        output_file = '{0}/clustered-transcriptome.fasta'.format(current_run_dir)

    # write the CD-HIT-EST process script
    if OK:
        try:
            separator = '#-------------------------------------------------------------------------------'

            # build the pieces of the cd-hit-est command; every piece but the
            # last one is written with a trailing line continuation
            command_piece_list = [
                ' /usr/bin/time',
                ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K"',
                ' cd-hit-est',
                ' -T {0}'.format(threads),
                ' -M {0}'.format(memory_limit),
                ' -i {0}'.format(transcriptome_file),
                ' -c {0}'.format(seq_identity_threshold),
                ' -n {0}'.format(word_length),
                ' -mask {0}'.format(mask),
                ' -match {0}'.format(match),
                ' -mismatch {0}'.format(mismatch),
                ' -o {0}'.format(output_file),
            ]
            if other_parameters.upper() != 'NONE':
                for parameter in [x.strip() for x in other_parameters.split(';')]:
                    # a parameter that does not match raises AttributeError on
                    # "mo.group", which is reported by the handler below
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                        command_piece_list.append(' -{0} {1}'.format(mo.group(1).strip(), mo.group(2).strip()))
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                        command_piece_list.append(' -{0}'.format(mo.group(1).strip()))

            # build the head of the script and the functions "init" and "run_cd_hit_est_process"
            script_line_list = [
                '#!/bin/bash',
                separator,
                'CDHIT_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_cd_hit_bioconda_code()),
                'PATH=$CDHIT_PATH:$PATH',
                'SEP="#########################################"',
                'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
                'source activate {0}'.format(xlib.get_cd_hit_bioconda_code()),
                separator,
                'function init',
                '{',
                ' INIT_DATETIME=`date --utc +%s`',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                ' echo "$SEP"',
                ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
                '}',
                separator,
                'function run_cd_hit_est_process',
                '{',
                ' cd {0}'.format(current_run_dir),
                ' echo "$SEP"',
                ' echo "Running {0} process ..."'.format(xlib.get_cd_hit_est_name()),
            ]
            script_line_list.extend('{0} \\'.format(piece) for piece in command_piece_list[:-1])
            script_line_list.append(command_piece_list[-1])
            script_line_list.extend([
                ' RC=$?',
                ' if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi',
                '}',
            ])

            # build the functions "end" and "manage_error"; they only differ in
            # the status text, the exit code and the error echo
            message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
            for function_name, status, exit_code in [('end', 'OK', 0), ('manage_error', 'WRONG', 3)]:
                script_line_list.extend([
                    separator,
                    'function {0}'.format(function_name),
                    '{',
                    ' END_DATETIME=`date --utc +%s`',
                    ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                    ' calculate_duration',
                    ' echo "$SEP"',
                ])
                if function_name == 'manage_error':
                    script_line_list.append(' echo "ERROR: $1 returned error $2"')
                script_line_list.extend([
                    ' echo "Script ended {0} at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'.format(status),
                    ' echo "$SEP"',
                    ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
                    ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_cd_hit_est_name()),
                    message_template.format(xlib.get_cd_hit_est_name(), cluster_name, status),
                    ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
                    ' exit {0}'.format(exit_code),
                    '}',
                ])

            # build the function "calculate_duration" and the main flow
            script_line_list.extend([
                separator,
                'function calculate_duration',
                '{',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
                ' HH=`expr $DURATION / 3600`',
                ' MM=`expr $DURATION % 3600 / 60`',
                ' SS=`expr $DURATION % 60`',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
                '}',
                separator,
                'init',
                'run_cd_hit_est_process',
                'end',
            ])

            # write the script file once all its lines are built
            script_dir = os.path.dirname(get_cd_hit_est_process_script())
            if not os.path.exists(script_dir):
                os.makedirs(script_dir)
            with open(get_cd_hit_est_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
                file_id.writelines('{0}\n'.format(script_line) for script_line in script_line_list)
        except Exception:
            error_list.append('*** ERROR: The file {0} can not be created'.format(get_cd_hit_est_process_script()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_cd_hit_est_config_file(strict):
    '''
    Validate the CD-HIT-EST config file of a run.

    Parameters:
        strict: it is not used by the checks done in this function.

    Returns:
        A tuple (OK, error_list). OK is False when any check fails and
        error_list holds one message per failed check plus a final summary.
    '''

    # initialize the error list; the control variable is derived from it
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # initialize the message used when a key is not found
    missing_key_message = '*** ERROR: the key "{0}" is not found in the section "{1}".'

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
    else:

        # get the sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
        else:
            identification_dict = cd_hit_est_option_dict.get('identification', {})

            # get the codes of the assembly software whose output can be clustered
            assembly_software_code_list = [
                xlib.get_soapdenovotrans_code(),
                xlib.get_transabyss_code(),
                xlib.get_trinity_code(),
                xlib.get_star_code(),
                xlib.get_cd_hit_est_code(),
                xlib.get_transcript_filter_code(),
            ]

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(missing_key_message.format('experiment_id', 'identification'))

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append(missing_key_message.format('assembly_software', 'identification'))
            elif assembly_software not in assembly_software_code_list:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(*assembly_software_code_list))

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append(missing_key_message.format('assembly_dataset_id', 'identification'))
            elif not assembly_dataset_id.startswith(tuple(assembly_software_code_list)):
                # the message shows the software names, the transcript filter one included
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(missing_key_message.format('assembly_type', 'identification'))
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
            elif assembly_dataset_id.startswith((xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
        else:
            parameter_dict = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {})

            # check section "CD-HIT-EST parameters" - keys "threads" and "memory_limit"
            _check_cd_hit_est_integer_key(parameter_dict, 'threads', not_found, error_list, minimum=0)
            _check_cd_hit_est_integer_key(parameter_dict, 'memory_limit', not_found, error_list, minimum=0)

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameter_dict.get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append(missing_key_message.format('seq_identity_threshold', 'CD-HIT-EST parameters'))
            else:
                try:
                    threshold = float(seq_identity_threshold)
                    is_threshold_valid = not (threshold < 0.0 or threshold > 1.0)
                except (TypeError, ValueError):
                    is_threshold_valid = False
                if not is_threshold_valid:
                    error_list.append('*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.')

            # check section "CD-HIT-EST parameters" - key "word_length"
            _check_cd_hit_est_integer_key(parameter_dict, 'word_length', not_found, error_list, minimum=1)

            # check section "CD-HIT-EST parameters" - key "mask"
            mask = parameter_dict.get('mask', not_found).upper()
            if mask == not_found:
                error_list.append(missing_key_message.format('mask', 'CD-HIT-EST parameters'))

            # check section "CD-HIT-EST parameters" - keys "match" and "mismatch"
            _check_cd_hit_est_integer_key(parameter_dict, 'match', not_found, error_list)
            _check_cd_hit_est_integer_key(parameter_dict, 'mismatch', not_found, error_list)

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = parameter_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append(missing_key_message.format('other_parameters', 'CD-HIT-EST parameters'))
            elif other_parameters.upper() != 'NONE':
                for parameter in [x.strip() for x in other_parameters.split(';')]:
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                    # the checks stop at the first parameter with a wrong syntax
                    if mo is None:
                        error_list.append('*** ERROR: the value of the key "other_parameters" in the section "CD-HIT-EST parameters" must be NONE or a valid parameter list.')
                        break
                    parameter_name = mo.group(1).strip()
                    if parameter_name in not_allowed_parameters_list:
                        error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "CD-HIT-EST parameters" because it is controled by {1}.'.format(parameter_name, xlib.get_project_name()))

    # set the control variable: the file is OK when no error has been found
    OK = not error_list

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_cd_hit_est_name()))

    # return the control variable and the error list
    return (OK, error_list)


def _check_cd_hit_est_integer_key(parameter_dict, key, not_found, error_list, minimum=None):
    '''
    Append to error_list the error of an integer key of the section
    "CD-HIT-EST parameters", if there is any.

    The key has to be found in parameter_dict and its value has to be
    convertible to an integer which is not less than minimum (when minimum
    is not None).
    '''

    value = parameter_dict.get(key, not_found)
    if value == not_found:
        error_list.append('*** ERROR: the key "{0}" is not found in the section "CD-HIT-EST parameters".'.format(key))
        return

    # verify the value
    try:
        is_valid = minimum is None or int(value) >= minimum
        if minimum is None:
            int(value)
    except (TypeError, ValueError, OverflowError):
        is_valid = False

    # append the error of a wrong value
    if not is_valid:
        if minimum is None:
            error_list.append('*** ERROR: the key "{0}" in the section "CD-HIT-EST parameters" must be an integer value.'.format(key))
        else:
            error_list.append('*** ERROR: the key "{0}" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to {1}.'.format(key, minimum))
def execute(self, event=None):
    '''
    List the submission logs of the local host in a table dialog.

    The inputs are validated, the log files are listed with the command
    built by xlib.list_log_files_command and one row per log file is shown.
    The form is closed once the table dialog has been closed.

    Parameters:
        event: it is not used by this method.
    '''

    # validate inputs; nothing else is done when they are not OK
    OK = self.validate_inputs()
    if not OK:
        message = 'Some input values are not OK.'
        tkinter.messagebox.showerror('{0} - {1}'.format(xlib.get_project_name(), self.head), message)
        return

    # get the local process dictionary
    local_process_dict = xlib.get_local_process_dict()

    # get the command to list the log files
    if self.wrapper_local_process_text.get() == ' all':
        command = xlib.list_log_files_command('all')
    else:
        command = xlib.list_log_files_command(self.local_process_id)

    # run the command in the local host
    output = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)

    # set the pattern of the log file names: the first group is the
    # environment and the other ones are the process id, the date and the time
    log_name_pattern = re.compile(r'^(.+)\-(.+)\-(.+)\-(.+)\.txt$')

    # build the log dictionary
    log_dict = {}
    for line in output.stdout.split('\n'):
        if line == '':
            continue
        run_id = os.path.basename(line)
        try:
            mo = log_name_pattern.search(run_id)
            local_process_id = mo.group(2).strip()
            yymmdd = mo.group(3)
            hhmmss = mo.group(4)
            process_text = local_process_dict[local_process_id]['text']
            log_date = '20{0}-{1}-{2}'.format(yymmdd[:2], yymmdd[2:4], yymmdd[4:])
            log_time = '{0}:{1}:{2}'.format(hhmmss[:2], hhmmss[2:4], hhmmss[4:])
        except Exception:
            # the file name has not the expected pattern or its process is
            # not in the local process dictionary
            process_text = 'unknown process'
            log_date = '0000-00-00'
            log_time = '00:00:00'
        log_dict[run_id] = {'run_id': run_id, 'process_text': process_text, 'date': log_date, 'time': log_time}

    # verify if there are any local process logs
    if log_dict == {}:
        message = 'There is not any local process log.'
        tkinter.messagebox.showwarning('{0} - {1}'.format(xlib.get_project_name(), self.head), message)
        return

    # build the data list
    data_list = ['run_id', 'process_text', 'date', 'time']

    # build the data dictionary
    data_dict = {
        'run_id': {'text': 'Run id', 'width': 300, 'aligment': 'left'},
        'process_text': {'text': 'Process', 'width': 300, 'aligment': 'left'},
        'date': {'text': 'Date', 'width': 80, 'aligment': 'right'},
        'time': {'text': 'Time', 'width': 80, 'aligment': 'right'},
    }

    # create the dialog Table to list the local process logs
    dialog_table = gdialogs.DialogTable(self, 'Local process log', 400, 900, data_list, data_dict, log_dict, 'view_submission_logs')
    self.wait_window(dialog_table)

    # close the form
    self.close()
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file and a Bash script is
    written to the path returned by get_busco_process_script().

    Parameters:
        cluster_name: cluster name embedded in the log and mail texts.
        current_run_dir: run directory where the script changes to; its
            base name is passed to run_BUSCO.py as the output name.

    Returns:
        A tuple (OK, error_list). OK is False and error_list holds the error
        messages when the script can not be built.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    identification_dict = busco_option_dict['identification']
    parameter_dict = busco_option_dict['BUSCO parameters']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    ncpu = parameter_dict['ncpu']
    lineage_data = parameter_dict['lineage_data']
    lineage_data_file = '{0}.tar.gz'.format(lineage_data)
    lineage_data_url = 'http://busco.ezlab.org/v2/datasets/{0}'.format(lineage_data_file)
    mode = parameter_dict['mode'].lower()
    evalue = parameter_dict['evalue']
    limit = parameter_dict['limit']
    species = parameter_dict['species']
    long_mode = parameter_dict['long'].upper()
    # the Augustus options keep their case because they are written as they
    # are in the command; only the comparison with "NONE" is done in upper case
    augustus_options = parameter_dict['augustus_options']

    # set the transcriptome file name; it stays None when the assembly software
    # or the assembly type is not a known one
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        # the assembly type is compared in upper case so that a value like
        # "contigs" is also recognized
        extension = {'CONTIGS': 'contig', 'SCAFFOLDS': 'scafSeq'}.get(assembly_type.upper())
        if extension is not None:
            transcriptome_file_name = '{0}-{1}.{2}'.format(experiment_id, assembly_dataset_id, extension)
    else:
        fixed_file_name_list = [
            (xlib.get_transabyss_code(), 'transabyss-final.fa'),
            (xlib.get_trinity_code(), 'Trinity.fasta'),
            (xlib.get_star_code(), 'Trinity-GG.fasta'),
            (xlib.get_cd_hit_est_code(), 'clustered-transcriptome.fasta'),
            (xlib.get_transcript_filter_code(), 'filtered-transcriptome.fasta'),
        ]
        for software_code, file_name in fixed_file_name_list:
            if assembly_software == software_code:
                transcriptome_file_name = file_name
                break

    # set the transcriptome file path
    if transcriptome_file_name is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined for the assembly software {0} and the assembly type {1}.'.format(assembly_software, assembly_type))
        OK = False
    else:
        transcriptome_file = '{0}/{1}'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), transcriptome_file_name)

    # write the BUSCO process script
    if OK:
        try:
            separator = '#-------------------------------------------------------------------------------'

            # build the pieces of the run_BUSCO.py command; every piece but the
            # last one is written with a trailing line continuation
            command_piece_list = [
                ' /usr/bin/time',
                ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K"',
                ' run_BUSCO.py',
                ' --cpu={0}'.format(ncpu),
                ' --lineage_path=./{0}'.format(lineage_data),
                ' --mode={0}'.format(mode),
                ' --evalue={0}'.format(evalue),
                ' --limit={0}'.format(limit),
            ]
            if species.upper() != 'NONE':
                command_piece_list.append(' --species={0}'.format(species))
            if long_mode == 'YES':
                command_piece_list.append(' --long')
            if augustus_options.upper() != 'NONE':
                # NOTE(review): the run_BUSCO.py documentation names this option
                # "--augustus_parameters"; confirm the flag against the installed
                # BUSCO version before changing it
                command_piece_list.append(" --august_options='{0}'".format(augustus_options))
            command_piece_list.append(' --in={0}'.format(transcriptome_file))
            command_piece_list.append(' --out={0}'.format(os.path.basename(current_run_dir)))

            # build the head of the script and the functions "init",
            # "download_lineage_data" and "run_busco_process"
            script_line_list = [
                '#!/bin/bash',
                separator,
                'BUSCO_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_busco_bioconda_code()),
                'export PATH=$BUSCO_PATH:$PATH',
                'SEP="#########################################"',
                'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
                'source activate {0}'.format(xlib.get_busco_bioconda_code()),
                separator,
                'function init',
                '{',
                ' INIT_DATETIME=`date --utc +%s`',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                ' echo "$SEP"',
                ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
                '}',
                separator,
                'function download_lineage_data',
                '{',
                ' cd {0}'.format(current_run_dir),
                ' echo "$SEP"',
                ' echo "Downloading lineage data ..."',
                ' wget --quiet --output-document ./{0} {1}'.format(lineage_data_file, lineage_data_url),
                ' tar -xzvf ./{0}'.format(lineage_data_file),
                ' rm ./{0}'.format(lineage_data_file),
                '}',
                separator,
                'function run_busco_process',
                '{',
                ' cd {0}'.format(current_run_dir),
                ' echo "$SEP"',
                ' run_BUSCO.py --version',
                ' echo "$SEP"',
            ]
            script_line_list.extend('{0} \\'.format(piece) for piece in command_piece_list[:-1])
            script_line_list.append(command_piece_list[-1])
            script_line_list.extend([
                ' RC=$?',
                ' if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi',
                '}',
            ])

            # build the functions "end" and "manage_error"; they only differ in
            # the status text, the exit code and the error echo
            message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
            for function_name, status, exit_code in [('end', 'OK', 0), ('manage_error', 'WRONG', 3)]:
                script_line_list.extend([
                    separator,
                    'function {0}'.format(function_name),
                    '{',
                    ' END_DATETIME=`date --utc +%s`',
                    ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                    ' calculate_duration',
                    ' echo "$SEP"',
                ])
                if function_name == 'manage_error':
                    script_line_list.append(' echo "ERROR: $1 returned error $2"')
                script_line_list.extend([
                    ' echo "Script ended {0} at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'.format(status),
                    ' echo "$SEP"',
                    ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
                    ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_busco_name()),
                    message_template.format(xlib.get_busco_name(), cluster_name, status),
                    ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
                    ' exit {0}'.format(exit_code),
                    '}',
                ])

            # build the function "calculate_duration" and the main flow
            script_line_list.extend([
                separator,
                'function calculate_duration',
                '{',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
                ' HH=`expr $DURATION / 3600`',
                ' MM=`expr $DURATION % 3600 / 60`',
                ' SS=`expr $DURATION % 60`',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
                '}',
                separator,
                'init',
                'download_lineage_data',
                'run_busco_process',
                'end',
            ])

            # write the script file once all its lines are built
            script_dir = os.path.dirname(get_busco_process_script())
            if not os.path.exists(script_dir):
                os.makedirs(script_dir)
            with open(get_busco_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
                file_id.writelines('{0}\n'.format(script_line) for script_line in script_line_list)
        except Exception:
            error_list.append('*** ERROR: The file {0} can not be created'.format(get_busco_process_script()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def execute(self, event=None):
    '''
    Show the result datasets of the selected experiment in a dialog table.

    The inputs are validated, the entries of the result directory of the
    selected experiment are listed in the cluster through the SSH client and
    one row per result dataset (experiment, result dataset, bioinfo
    application, date and time) is shown in a "DialogTable". The form is
    closed once the dialog window has been closed.

    The parameter "event" is not used by the method.
    '''

    # validate inputs
    if not self.validate_inputs():
        message = 'Some input values are not OK.'
        tkinter.messagebox.showerror('{0} - {1}'.format(xlib.get_project_name(), self.head), message)
        return

    # get the selected experiment and its result directory in the cluster
    experiment_id = self.wrapper_experiment_id.get()
    experiment_dir = '{0}/{1}'.format(xlib.get_cluster_result_dir(), experiment_id)

    # list the entries of the experiment directory whose "ls -ld" line does not
    # start with "-", i.e. the entries that are not regular files
    command = 'cd {0}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(experiment_dir)
    (OK, stdout, _stderr) = xssh.execute_cluster_command(self.ssh_client, command)
    if not OK:
        return

    # getters of the code and the name of every bioinfo application; the order
    # is the one used to test the prefix of the result dataset identification
    bioinfo_app_getters = (
        (xlib.get_bedtools_code, xlib.get_bedtools_name),
        (xlib.get_blastplus_code, xlib.get_blastplus_name),
        (xlib.get_bowtie2_code, xlib.get_bowtie2_name),
        (xlib.get_busco_code, xlib.get_busco_name),
        (xlib.get_cd_hit_code, xlib.get_cd_hit_name),
        (xlib.get_cd_hit_est_code, xlib.get_cd_hit_est_name),
        (xlib.get_detonate_code, xlib.get_detonate_name),
        (xlib.get_emboss_code, xlib.get_emboss_name),
        (xlib.get_fastqc_code, xlib.get_fastqc_name),
        (xlib.get_gmap_code, xlib.get_gmap_name),
        (xlib.get_gmap_gsnap_code, xlib.get_gmap_gsnap_name),
        (xlib.get_gzip_code, xlib.get_gzip_name),
        (xlib.get_insilico_read_normalization_code, xlib.get_insilico_read_normalization_name),
        (xlib.get_miniconda3_code, xlib.get_miniconda3_name),
        (xlib.get_ngshelper_code, xlib.get_ngshelper_name),
        (xlib.get_quast_code, xlib.get_quast_name),
        (xlib.get_r_code, xlib.get_r_name),
        (xlib.get_ref_eval_code, xlib.get_ref_eval_name),
        (xlib.get_rnaquast_code, xlib.get_rnaquast_name),
        (xlib.get_rsem_code, xlib.get_rsem_name),
        (xlib.get_rsem_eval_code, xlib.get_rsem_eval_name),
        (xlib.get_samtools_code, xlib.get_samtools_name),
        (xlib.get_soapdenovotrans_code, xlib.get_soapdenovotrans_name),
        (xlib.get_star_code, xlib.get_star_name),
        (xlib.get_transabyss_code, xlib.get_transabyss_name),
        (xlib.get_transcript_filter_code, xlib.get_transcript_filter_name),
        (xlib.get_transcriptome_blastx_code, xlib.get_transcriptome_blastx_name),
        (xlib.get_transrate_code, xlib.get_transrate_name),
        (xlib.get_trimmomatic_code, xlib.get_trimmomatic_name),
        (xlib.get_trinity_code, xlib.get_trinity_name),
    )

    # the last two fields separated by "-" are read as yymmdd and hhmmss
    result_dataset_id_regex = re.compile(r'^(.+)\-(.+)\-(.+)$')

    # build the result dataset dictionary of the experiment
    result_dataset_dict = {}
    for line in stdout:
        result_dataset_id = line.rstrip('\n')
        if result_dataset_id == 'lost+found':
            continue

        # get the date and the time from the result dataset identification;
        # placeholders are used when the identification has not the expected format
        mo = result_dataset_id_regex.search(result_dataset_id)
        if mo is None:
            date = '0000-00-00'
            time = '00:00:00'
        else:
            yymmdd = mo.group(2)
            hhmmss = mo.group(3)
            date = '20{0}-{1}-{2}'.format(yymmdd[:2], yymmdd[2:4], yymmdd[4:])
            time = '{0}:{1}:{2}'.format(hhmmss[:2], hhmmss[2:4], hhmmss[4:])

        # get the name of the bioinfo application from the identification prefix
        for get_code, get_name in bioinfo_app_getters:
            if result_dataset_id.startswith(get_code() + '-'):
                bioinfo_app_name = get_name()
                break
        else:
            bioinfo_app_name = 'xxx'

        result_dataset_dict[result_dataset_id] = {'experiment_id': experiment_id, 'result_dataset_id': result_dataset_id, 'bioinfo_app': bioinfo_app_name, 'date': date, 'time': time}

    # warn if there is not any result dataset (the empty table is shown anyway)
    if result_dataset_dict == {}:
        message = 'There is not any run.'
        tkinter.messagebox.showwarning('{0} - {1}'.format(xlib.get_project_name(), self.head), message)

    # build the data list
    data_list = ['experiment_id', 'result_dataset_id', 'bioinfo_app', 'date', 'time']

    # build the data dictionary
    data_dict = {
        'experiment_id': {'text': 'Experiment id. / Process', 'width': 200, 'aligment': 'left'},
        'result_dataset_id': {'text': 'Result dataset', 'width': 200, 'aligment': 'left'},
        'bioinfo_app': {'text': 'Bioinfo app / Utility', 'width': 200, 'aligment': 'left'},
        'date': {'text': 'Date', 'width': 80, 'aligment': 'right'},
        'time': {'text': 'Time', 'width': 80, 'aligment': 'right'},
    }

    # create the dialog Table to show the result datasets and wait until it is closed
    dialog_table = gdialogs.DialogTable(self, 'Experiment runs in {0}'.format(experiment_dir), 400, 900, data_list, data_dict, result_dataset_dict, 'view_result_logs', [self.wrapper_cluster_name.get()])
    self.wait_window(dialog_table)

    # close the form
    self.close()
def build_infrastructure_software_installation_script(cluster_name):
    '''
    Build the infrastructure software installation script.

    The script is a Bash script that fixes the file /etc/apt/sources.list,
    installs the packages libtbb2 and mailutils and mails the final status to
    the contact address got from "xconfiguration.get_contact_data()".

    cluster_name: cluster name written in the log and mail messages.

    Return the tuple (OK, error_list). When the script can not be built or
    written, OK is False and error_list holds the error message.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get infrastructure software installation script path
    script_path = get_infrastructure_software_installation_script()

    # lines that are repeated along the script
    separator = '#-------------------------------------------------------------------------------'
    mail_command = ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'
    message_template = ' MESSAGE="The infrastructure software installation in node $HOSTNAME of cluster {0} ended {1} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION).<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'

    try:

        recipient_line = ' RECIPIENT={0}'.format(xconfiguration.get_contact_data())
        subject_line = ' SUBJECT="{0}: Infrastructure software installation"'.format(xlib.get_project_name())

        # build all the script lines before opening the file, so that a failure
        # here does not leave a half-written script
        script_lines = [
            '#!/bin/bash',
            separator,
            'export DEBIAN_FRONTEND=noninteractive',
            'SEP="#########################################"',
            separator,
            'function init',
            '{',
            ' INIT_DATETIME=`date --utc +%s`',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' echo "$SEP"',
            ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator,
            'function fix_source_list',
            '{',
            ' echo "$SEP"',
            ' echo "Fixing file /etc/apt/sources.list ..."',
            ' sed -i "s/us-east-1.ec2.archive.ubuntu.com/old-releases.ubuntu.com/g" /etc/apt/sources.list',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error sed $RC; fi',
            # the backslash is escaped so that the emitted text keeps "\." without
            # using an invalid escape sequence in the Python literal
            ' sed -i "s/security.ubuntu.com/old-releases.ubuntu\\.com/g" /etc/apt/sources.list',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error sed $RC; fi',
            ' apt-get update',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi',
            ' echo',
            ' echo "The file is fixed."',
            '}',
            separator,
            'function install_libtbb2',
            '{',
            ' echo "$SEP"',
            ' echo "Installing the package libtbb2 ..."',
            ' echo',
            ' apt-get --assume-yes install libtbb2',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi',
            ' echo',
            ' echo "The package is installed."',
            '}',
            separator,
            'function install_mailutils',
            '{',
            ' echo "$SEP"',
            ' echo "Installing the package mailutils ..."',
            ' echo',
            ' HOST_IP=`curl checkip.amazonaws.com`',
            ' HOST_IP2=`echo "${HOST_IP//./-}"`',
            ' HOST_ADDRESS="ec2-${HOST_IP2}-compute-1.amazonaws.com"',
            ' echo "HOST_IP: $HOST_IP HOST_ADDRESS: $HOST_ADDRESS"',
            ' debconf-set-selections <<< "postfix postfix/mailname string $HOST_ADDRESS"',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error debconf-set-selections $RC; fi',
            ' debconf-set-selections <<< "postfix postfix/main_mailer_type string \'Internet Site\'"',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error debconf-set-selections $RC; fi',
            ' apt-get --assume-yes install mailutils',
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi',
            ' echo',
            ' echo "The package is installed."',
            '}',
            separator,
            'function end',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(cluster_name, 'OK'),
            mail_command,
            ' exit 0',
            '}',
            separator,
            'function manage_error',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "ERROR: $1 returned error $2"',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(cluster_name, 'WRONG'),
            mail_command,
            ' exit 3',
            '}',
            separator,
            'function calculate_duration',
            '{',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            ' HH=`expr $DURATION / 3600`',
            ' MM=`expr $DURATION % 3600 / 60`',
            ' SS=`expr $DURATION % 60`',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
            'init',
            'fix_source_list',
            'install_libtbb2',
            'install_mailutils',
            'end',
        ]

        # write the infrastructure software installation script
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(script_line) for script_line in script_lines)

    except Exception:
        # "Exception" keeps the best-effort error report without swallowing
        # KeyboardInterrupt or SystemExit
        error_list.append('*** ERROR: The file {0} can not be created'.format(script_path))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_quast_process_script(cluster_name, current_run_dir):
    '''
    Build the current QUAST process script.

    The options are read from the QUAST config file; the transcriptome file is
    chosen according to the assembly software (and, for SOAPdenovo-Trans, the
    assembly type) and a Bash script that runs "quast.py" and mails the final
    status is written.

    cluster_name: cluster name written in the log and mail messages.
    current_run_dir: directory where the script changes to and that is passed
        to "quast.py" as output directory.

    Return the tuple (OK, error_list). When the assembly software/type is not
    supported or the script can not be built or written, OK is False and
    error_list holds the error message.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the QUAST option dictionary
    quast_option_dict = xlib.get_option_dict(get_quast_config_file())

    # get the options
    identification = quast_option_dict['identification']
    experiment_id = identification['experiment_id']
    reference_dataset_id = identification['reference_dataset_id']
    reference_file = identification['reference_file']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    threads = quast_option_dict['QUAST parameters']['threads']

    # set the reference file path
    use_reference = reference_dataset_id.upper() != 'NONE'
    if use_reference:
        reference_file_path = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the transcriptome file path
    assembly_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(assembly_dir, experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(assembly_dir, experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(assembly_dir)
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(assembly_dir)
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(assembly_dir)
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(assembly_dir)
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(assembly_dir)

    # an unsupported assembly software/type was previously reported as a file
    # creation failure (the unbound variable was swallowed by a bare except)
    if transcriptome_file is None:
        error_list.append('*** ERROR: The assembly software {0} with assembly type {1} is not supported'.format(assembly_software, assembly_type))
        return (False, error_list)

    # get the QUAST process script name
    quast_process_script = get_quast_process_script()

    # lines that are repeated along the script
    separator = '#-------------------------------------------------------------------------------'
    mail_command = ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'
    message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'

    try:

        quast_name = xlib.get_quast_name()
        recipient_line = ' RECIPIENT={0}'.format(xconfiguration.get_contact_data())
        subject_line = ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), quast_name)

        # build all the script lines before opening the file, so that a failure
        # here does not leave a half-written script
        script_lines = [
            '#!/bin/bash',
            separator,
            'QUAST_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_quast_bioconda_code()),
            'PATH=$QUAST_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_quast_bioconda_code()),
            separator,
            'function init',
            '{',
            ' INIT_DATETIME=`date --utc +%s`',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' echo "$SEP"',
            ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator,
            'function run_quast_process',
            '{',
            ' cd {0}'.format(current_run_dir),
            ' echo "$SEP"',
            ' quast.py --version',
            ' echo "$SEP"',
            ' /usr/bin/time \\',
            ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
            ' quast.py \\',
            ' --threads {0} \\'.format(threads),
            ' --output-dir {0} \\'.format(current_run_dir),
        ]
        if use_reference:
            script_lines.append(' -R {0} \\'.format(reference_file_path))
        if assembly_type.upper() == 'SCAFFOLDS':
            script_lines.append(' --scaffolds \\')
        script_lines.extend([
            ' {0}'.format(transcriptome_file),
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error quast.py $RC; fi',
            '}',
            separator,
            'function end',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(quast_name, cluster_name, 'OK'),
            mail_command,
            ' exit 0',
            '}',
            separator,
            'function manage_error',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "ERROR: $1 returned error $2"',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(quast_name, cluster_name, 'WRONG'),
            mail_command,
            ' exit 3',
            '}',
            separator,
            'function calculate_duration',
            '{',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            ' HH=`expr $DURATION / 3600`',
            ' MM=`expr $DURATION % 3600 / 60`',
            ' SS=`expr $DURATION % 60`',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
            'init',
            'run_quast_process',
            'end',
        ])

        # write the QUAST process script
        os.makedirs(os.path.dirname(quast_process_script), exist_ok=True)
        with open(quast_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(script_line) for script_line in script_lines)

    except Exception:
        # "Exception" keeps the best-effort error report without swallowing
        # KeyboardInterrupt or SystemExit
        error_list.append('*** ERROR: The file {0} can not be created'.format(quast_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_gzip_process_script(cluster_name, dataset_type, current_run_dir):
    '''
    Build the current gzip process script.

    The options are read from the gzip config file of "dataset_type". For the
    dataset types 'reference', 'database', 'read' and 'result' the script
    compresses or decompresses with "gzip" every file declared in the sections
    "file-n"; for 'whole-result' it compresses or decompresses the whole
    dataset directory with "tar" and then removes that directory. The script
    mails the final status to the contact address.

    cluster_name: cluster name written in the log and mail messages.
    dataset_type: dataset type used to locate the gzip config file.
    current_run_dir: directory where the script changes to before running.

    Return the tuple (OK, error_list). When the script can not be built or
    written, OK is False and error_list holds the error message.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the gzip option dictionary
    gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))

    # get the options
    experiment_id = gzip_option_dict['identification']['experiment_id']
    dataset_type_2 = gzip_option_dict['identification']['dataset_type']
    dataset_id = gzip_option_dict['identification']['dataset_id']
    action = gzip_option_dict['gzip parameters']['action']

    # build the list of (dataset subdirectory, file name) of the sections whose
    # identification is like file-n, in the sorted order of the sections
    file_list = []
    for section in sorted(gzip_option_dict.keys()):
        if re.match('^file-[0-9]+$', section):
            dataset_subdirectory = gzip_option_dict[section]['dataset_subdirectory']
            file_name = gzip_option_dict[section]['file_name']
            file_list.append((dataset_subdirectory, file_name))

    # get the dataset directory (it is only used with the known dataset types)
    dataset_dir = None
    if dataset_type_2 == 'reference':
        dataset_dir = xlib.get_cluster_reference_dataset_dir(dataset_id)
    elif dataset_type_2 == 'database':
        dataset_dir = xlib.get_cluster_database_dataset_dir(dataset_id)
    elif dataset_type_2 == 'read':
        dataset_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, dataset_id)
    elif dataset_type_2 in ('result', 'whole-result'):
        dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, dataset_id)

    # get the gzip process script path
    gzip_process_script = get_gzip_process_script(dataset_type_2)

    # lines that are repeated along the script
    separator = '#-------------------------------------------------------------------------------'
    time_command = ' /usr/bin/time \\'
    time_format = ' --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
    mail_command = ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'
    # the cluster name is the second format argument; the field "{0}" was used
    # for it before, so the mail showed the gzip name instead of the cluster name
    message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'

    try:

        gzip_name = xlib.get_gzip_name()
        recipient_line = ' RECIPIENT={0}'.format(xconfiguration.get_contact_data())
        subject_line = ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), gzip_name)

        # build all the script lines before opening the file, so that a failure
        # here does not leave a half-written script
        script_lines = [
            '#!/bin/bash',
            separator,
            'SEP="#########################################"',
            separator,
            'function init',
            '{',
            ' INIT_DATETIME=`date --utc +%s`',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' echo "$SEP"',
            ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator,
            'function run_gzip_process',
            '{',
        ]

        if dataset_type_2 in ['reference', 'database', 'read', 'result']:
            # one gzip command per file
            script_lines.append(' cd {0}'.format(current_run_dir))
            for dataset_subdirectory, file_name in file_list:
                file_path = '{0}/{1}/{2}'.format(dataset_dir, dataset_subdirectory, file_name)
                script_lines.append(' echo "$SEP"')
                script_lines.append(' echo "Compressing/decompressing {0} ..."'.format(file_path))
                script_lines.append(time_command)
                script_lines.append(time_format)
                if action == 'compress':
                    script_lines.append(' gzip {0}'.format(file_path))
                elif action == 'decompress':
                    script_lines.append(' gzip --decompress {0}'.format(file_path))
                script_lines.append(' RC=$?')
                script_lines.append(' if [ $RC -ne 0 ]; then manage_error gzip $RC; fi')
        elif dataset_type_2 == 'whole-result':
            # one tar command for the whole dataset directory, which is removed afterwards
            script_lines.append(' cd {0}'.format(current_run_dir))
            script_lines.append(' echo "$SEP"')
            script_lines.append(' echo "Compressing/decompressing {0} ..."'.format(dataset_dir))
            script_lines.append(time_command)
            script_lines.append(time_format)
            if action == 'compress':
                script_lines.append(' tar --create --gzip --verbose --file={0}.tar.gz {0}'.format(dataset_dir))
            elif action == 'decompress':
                script_lines.append(' tar --extract --gzip --verbose --file={0} --directory=/'.format(dataset_dir))
            script_lines.extend([
                ' RC=$?',
                ' if [ $RC -ne 0 ]; then manage_error tar $RC; fi',
                ' echo "$SEP"',
                ' echo "Removing {0} ..."'.format(dataset_dir),
                time_command,
                time_format,
                ' rm -rf {0}'.format(dataset_dir),
                ' RC=$?',
                ' if [ $RC -ne 0 ]; then manage_error rm $RC; fi',
            ])

        script_lines.extend([
            '}',
            separator,
            'function end',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(gzip_name, cluster_name, 'OK'),
            mail_command,
            ' exit 0',
            '}',
            separator,
            'function manage_error',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "ERROR: $1 returned error $2"',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(gzip_name, cluster_name, 'WRONG'),
            mail_command,
            ' exit 3',
            '}',
            separator,
            'function calculate_duration',
            '{',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            ' HH=`expr $DURATION / 3600`',
            ' MM=`expr $DURATION % 3600 / 60`',
            ' SS=`expr $DURATION % 60`',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
            'init',
            'run_gzip_process',
            'end',
        ])

        # write the gzip process script
        os.makedirs(os.path.dirname(gzip_process_script), exist_ok=True)
        with open(gzip_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(script_line) for script_line in script_lines)

    except Exception:
        # "Exception" keeps the best-effort error report without swallowing
        # KeyboardInterrupt or SystemExit
        error_list.append('*** ERROR: The file {0} can not be created'.format(gzip_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)