def create_quast_config_file(experiment_id='exp001', reference_dataset_id='NONE', reference_file='NONE', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create the QUAST config file with the default options. It is necessary to
    update the options in each run.

    The assembly software written to the file is the first known assembly
    software code that prefixes "assembly_dataset_id". The file path is the
    one returned by get_quast_config_file(); its directory is created when it
    does not exist.

    Returns the tuple (OK, error_list). OK is False, and error_list holds the
    error messages, when the assembly software can not be determined from
    "assembly_dataset_id" or when the file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # known assembly software as (code getter, name getter), in matching order
    assembly_software_getters = (
        (xlib.get_soapdenovotrans_code, xlib.get_soapdenovotrans_name),
        (xlib.get_transabyss_code, xlib.get_transabyss_name),
        (xlib.get_trinity_code, xlib.get_trinity_name),
        (xlib.get_star_code, xlib.get_star_name),
        (xlib.get_cd_hit_est_code, xlib.get_cd_hit_est_name),
        (xlib.get_transcript_filter_code, xlib.get_transcript_filter_name),
    )

    # set the app: the first code that prefixes the assembly dataset id wins
    assembly_software = None
    for (get_code, _) in assembly_software_getters:
        code = get_code()
        if assembly_dataset_id.startswith(code):
            assembly_software = code
            break

    # stop before touching the file when the app is unknown, so that an
    # existing config file is not truncated and left half-written
    if assembly_software is None:
        error_list.append(f'*** ERROR: The assembly dataset id {assembly_dataset_id} does not start with a known assembly software code.')
        OK = False

    # create the QUAST config file and write the default options
    if OK:
        try:
            config_dir = os.path.dirname(get_quast_config_file())
            if not os.path.exists(config_dir):
                os.makedirs(config_dir)
            with open(get_quast_config_file(), mode='w', encoding='utf8') as file_id:
                # texts of the comments that enumerate the known assembly software
                assembly_software_text = ' or '.join(f'{get_code()} ({get_name()})' for (get_code, get_name) in assembly_software_getters)
                assembly_name_list = [get_name() for (_, get_name) in assembly_software_getters]
                assembly_type_text = '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(*assembly_name_list)
                # header comments
                file_id.write('# You must review the information of this file and update the values with the corresponding ones to the current run.\n')
                file_id.write('#\n')
                file_id.write(f'# The reference file must be located in the cluster directory {xlib.get_cluster_reference_dir()}/experiment_id/reference_dataset_id\n')
                file_id.write(f'# The assembly files must be located in the cluster directory {xlib.get_cluster_result_dir()}/experiment_id/assembly_dataset_id\n')
                file_id.write('# The experiment_id, reference_dataset_id, reference_file and assembly_dataset_id are fixed in the identification section.\n')
                file_id.write('#\n')
                file_id.write('# You can consult the parameters of QUAST and their meaning in http://quast.sourceforge.net/quast.html.\n')
                file_id.write('\n')
                # section "identification"
                file_id.write('# This section has the information identifies the experiment.\n')
                file_id.write('[identification]\n')
                file_id.write('{0:<50} {1}\n'.format(f'experiment_id = {experiment_id}', '# experiment identification'))
                file_id.write('{0:<50} {1}\n'.format(f'reference_dataset_id = {reference_dataset_id}', '# reference dataset identification or NONE'))
                file_id.write('{0:<50} {1}\n'.format(f'reference_file = {reference_file}', '# reference file name or NONE'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_software = {assembly_software}', f'# assembly software: {assembly_software_text}'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_dataset_id = {assembly_dataset_id}', '# assembly dataset identification'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_type = {assembly_type}', assembly_type_text))
                file_id.write('\n')
                # section "QUAST parameters"
                file_id.write('# This section has the information to set the QUAST parameters\n')
                file_id.write('[QUAST parameters]\n')
                file_id.write('{0:<50} {1}\n'.format('threads = 2', '# number of threads for use'))
        except Exception as e:
            # report the cause too; a bare "except:" would also trap
            # KeyboardInterrupt and SystemExit
            error_list.append(f'*** EXCEPTION: "{e}".')
            error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_quast_config_file()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def get_assembly_software_code_list():
    '''
    Return the list of codes accepted as value of "assembly_software", in
    this order: SOAPdenovo-Trans, Trans-ABySS, Trinity, Genome-guided Trinity,
    CD-HIT-EST and transcript-filter (each one as returned by its xlib getter).
    '''

    # getters of the accepted codes, in the order of the returned list
    code_getter_tuple = (
        xlib.get_soapdenovotrans_code,
        xlib.get_transabyss_code,
        xlib.get_trinity_code,
        xlib.get_ggtrinity_code,
        xlib.get_cd_hit_est_code,
        xlib.get_transcript_filter_code,
    )

    # build the code list
    assembly_software_code_list = []
    for get_code in code_getter_tuple:
        assembly_software_code_list.append(get_code())

    return assembly_software_code_list
def _build_cd_hit_est_other_parameter_lines(other_parameters):
    '''
    Build the script lines corresponding to the value of the key
    "other_parameters" (format "--name-1[=value-1][; --name-2[=value-2]...]").

    Every parameter is written as "-name [value]"; every line except the last
    one ends with the shell continuation character.

    Raises ValueError when a parameter does not have the expected format.
    '''

    parameter_list = [x.strip() for x in other_parameters.split(';')]
    last_index = len(parameter_list) - 1

    line_list = []
    for i, parameter in enumerate(parameter_list):
        if parameter.find('=') > 0:
            mo = re.search(r'^--(.+)=(.+)$', parameter)
            if mo is None:
                raise ValueError(f'the parameter "{parameter}" does not have the format --name=value')
            option = f'-{mo.group(1).strip()} {mo.group(2).strip()}'
        else:
            mo = re.search(r'^--(.+)$', parameter)
            if mo is None:
                raise ValueError(f'the parameter "{parameter}" does not have the format --name')
            option = f'-{mo.group(1).strip()}'
        # only the lines before the last one are continued; a trailing
        # backslash on the last line would glue the next script line to the
        # command
        continuation = ' \\' if i < last_index else ''
        line_list.append(f' {option}{continuation}\n')

    return line_list


def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the config file returned by
    get_cd_hit_est_config_file() and the bash script is written to the path
    returned by get_cd_hit_est_process_script().

    Parameters: "cluster_name" is echoed by the script and used in the mail
    messages; "current_run_dir" is the directory where the script runs and
    where the clustered transcriptome is written.

    Returns the tuple (OK, error_list). OK is False, and error_list holds the
    error messages, when the transcriptome file can not be determined or the
    script can not be written. Errors raised while reading the config file
    are not caught here.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    identification_dict = cd_hit_est_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    parameters_dict = cd_hit_est_option_dict['CD-HIT-EST parameters']
    threads = parameters_dict['threads']
    memory_limit = parameters_dict['memory_limit']
    seq_identity_threshold = parameters_dict['seq_identity_threshold']
    word_length = parameters_dict['word_length']
    mask = parameters_dict['mask']
    match = parameters_dict['match']
    mismatch = parameters_dict['mismatch']
    other_parameters = parameters_dict['other_parameters']

    # set the transcriptome file name according to the assembly software
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # set the transcriptome file path; stop before touching the script file
    # when it can not be determined
    if transcriptome_file_name is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be determined for the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        OK = False
    else:
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{transcriptome_file_name}'

    # write the CD-HIT-EST process script
    if OK:

        # set the output file path
        output_file = f'{current_run_dir}/clustered-transcriptome.fasta'

        # line used to separate the blocks of the script
        separator = '#-------------------------------------------------------------------------------\n'

        try:
            # build the lines of the additional parameters before opening the
            # file, so a malformed value does not leave a truncated script
            if other_parameters.upper() == 'NONE':
                other_parameter_lines = []
            else:
                other_parameter_lines = _build_cd_hit_est_other_parameter_lines(other_parameters)

            script_dir = os.path.dirname(get_cd_hit_est_process_script())
            if not os.path.exists(script_dir):
                os.makedirs(script_dir)

            with open(get_cd_hit_est_process_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
                write = script_file_id.write

                # environment
                write('#!/bin/bash\n')
                write(separator)
                write('SEP="#########################################"\n')
                write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
                write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
                write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
                write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
                write(separator)
                write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
                write('export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
                write(separator)

                # status files
                write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
                write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
                write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
                write('mkdir --parents $STATUS_DIR\n')
                write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
                write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
                write(separator)
                write(f'CURRENT_DIR={current_run_dir}\n')
                write(separator)

                # function "init"
                write('function init\n')
                write('{\n')
                write(' INIT_DATETIME=`date --utc +%s`\n')
                write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
                write(' echo "$SEP"\n')
                write(' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
                write(' echo "$SEP"\n')
                write(f' echo "CLUSTER: {cluster_name}"\n')
                write(' echo "HOST NAME: $HOSTNAME"\n')
                write(' echo "HOST IP: $HOST_IP"\n')
                write(' echo "HOST ADDRESS: $HOST_ADDRESS"\n')
                write('}\n')
                write(separator)

                # function "run_cd_hit_est_process"
                write('function run_cd_hit_est_process\n')
                write('{\n')
                write(f' source activate {xlib.get_cd_hit_anaconda_code()}\n')
                write(' cd $CURRENT_DIR\n')
                write(' echo "$SEP"\n')
                write(' echo "Filtering transcriptome ..."\n')
                write(' /usr/bin/time \\\n')
                write(f' --format="{xlib.get_time_output_format()}" \\\n')
                write(' cd-hit-est \\\n')
                write(f' -T {threads} \\\n')
                write(f' -M {memory_limit} \\\n')
                write(f' -i {transcriptome_file} \\\n')
                write(f' -c {seq_identity_threshold} \\\n')
                write(f' -n {word_length} \\\n')
                write(f' -mask {mask} \\\n')
                write(f' -match {match} \\\n')
                write(f' -mismatch {mismatch} \\\n')
                if other_parameter_lines:
                    # the command continues with the additional parameters
                    write(f' -o {output_file} \\\n')
                    script_file_id.writelines(other_parameter_lines)
                else:
                    write(f' -o {output_file}\n')
                write(' RC=$?\n')
                write(' if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi\n')
                write(' echo "The transcriptome is filtered."\n')
                write(' conda deactivate\n')
                write('}\n')
                write(separator)

                # function "end"
                write('function end\n')
                write('{\n')
                write(' END_DATETIME=`date --utc +%s`\n')
                write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
                write(' calculate_duration\n')
                write(' echo "$SEP"\n')
                write(' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
                write(' echo "$SEP"\n')
                write(' send_mail ok\n')
                write(' touch $SCRIPT_STATUS_OK\n')
                write(' exit 0\n')
                write('}\n')
                write(separator)

                # function "manage_error"
                write('function manage_error\n')
                write('{\n')
                write(' END_DATETIME=`date --utc +%s`\n')
                write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
                write(' calculate_duration\n')
                write(' echo "$SEP"\n')
                write(' echo "ERROR: $1 returned error $2"\n')
                write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
                write(' echo "$SEP"\n')
                write(' send_mail wrong\n')
                write(' touch $SCRIPT_STATUS_WRONG\n')
                write(' exit 3\n')
                write('}\n')
                write(separator)

                # function "send_mail"
                process_name = f'{xlib.get_cd_hit_est_name()} process'
                mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
                mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
                write('function send_mail\n')
                write('{\n')
                write(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
                write(' if [ "$1" == "ok" ]; then\n')
                write(f' MESSAGE="{mail_message_ok}"\n')
                write(' elif [ "$1" == "wrong" ]; then\n')
                write(f' MESSAGE="{mail_message_wrong}"\n')
                write(' else\n')
                write(' MESSAGE=""\n')
                write(' fi\n')
                write(' DESTINATION_FILE=mail-destination.json\n')
                write(' echo "{" > $DESTINATION_FILE\n')
                write(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
                write(' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
                write(' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
                write(' echo "}" >> $DESTINATION_FILE\n')
                write(' MESSAGE_FILE=mail-message.json\n')
                write(' echo "{" > $MESSAGE_FILE\n')
                write(' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
                write(' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
                write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
                write(' echo " }," >> $MESSAGE_FILE\n')
                write(' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
                write(' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
                write(' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
                write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
                write(' echo " }" >> $MESSAGE_FILE\n')
                write(' echo " }" >> $MESSAGE_FILE\n')
                write(' echo "}" >> $MESSAGE_FILE\n')
                write(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
                write('}\n')
                write(separator)

                # function "calculate_duration"
                write('function calculate_duration\n')
                write('{\n')
                write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
                write(' HH=`expr $DURATION / 3600`\n')
                write(' MM=`expr $DURATION % 3600 / 60`\n')
                write(' SS=`expr $DURATION % 60`\n')
                write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
                write('}\n')
                write(separator)

                # main sequence of the script
                write('init\n')
                write('run_cd_hit_est_process\n')
                write('end\n')
        except Exception as e:
            error_list.append(f'*** EXCEPTION: "{e}".')
            error_list.append(f'*** ERROR: The file {get_cd_hit_est_process_script()} can not be created')
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def check_cd_hit_est_config_file(strict):
    '''
    Check the CD-HIT-EST config file of a run.

    The option dictionary is built from the file returned by
    get_cd_hit_est_config_file(); then the presence and the value of the keys
    of the sections "identification" and "CD-HIT-EST parameters" are checked.

    The parameter "strict" is accepted but it is not used by these checks.

    Returns the tuple (OK, error_list). OK is True only if no check fails;
    error_list holds a message for every error found and, when there is any
    error, a final message warning that the config file is not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification_dict = cd_hit_est_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # (CONTIGS or SCAFFOLDS is required when the dataset id starts
            # with the SOAPdenovo-Trans code and NONE in any other case)
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE':
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:
            parameters_dict = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {})

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = parameters_dict.get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=0):
                error_list.append('*** ERROR: the key "threads" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = parameters_dict.get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(memory_limit, minimum=0):
                error_list.append('*** ERROR: the key "memory_limit" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameters_dict.get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_float(seq_identity_threshold, minimum=0., maximum=1.):
                error_list.append('*** ERROR: the key "seq_identity_threshold" has to be a float number between 0.0 and 1.0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = parameters_dict.get('word_length', not_found)
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(word_length, minimum=1):
                error_list.append('*** ERROR: the key "word_length" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            # (only its presence is checked)
            mask = parameters_dict.get('mask', not_found)
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = parameters_dict.get('match', not_found)
            if match == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(match):
                error_list.append('*** ERROR: the key "match" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = parameters_dict.get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(mismatch):
                error_list.append('*** ERROR: the key "mismatch" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            # (the parameters that have their own key are not allowed here)
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # keep the result in its own variable: assigning it to OK
                # would reset to True the errors detected by previous checks
                (OK2, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not OK2:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_cd_hit_est_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def execute(self, event=None):
    '''
    List the result datasets of the selected experiment in the cluster and
    show them in a dialog table.

    The inputs are validated first; an error box is shown and nothing else is
    done if they are not OK. Then the entries of the experiment result
    directory are got from the cluster, a row (experiment, dataset, bioinfo
    app, date and time) is built for each one and the rows are passed to a
    gdialogs.DialogTable. The form is closed when the dialog is closed.

    The parameter "event" is not used by this method.
    '''

    # validate inputs
    OK = self.validate_inputs()
    if not OK:
        message = 'Some input values are not OK.'
        tkinter.messagebox.showerror('{0} - {1}'.format(xlib.get_project_name(), self.head), message)
        return

    # get the entries of the experiment result directory; the command echoes
    # every entry whose "ls -ld" line does not start with "-", i.e. every
    # entry that is not a regular file
    command = 'cd {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(xlib.get_cluster_result_dir(), self.wrapper_experiment_id.get())
    (OK, stdout, stderr) = xssh.execute_cluster_command(self.ssh_client, command)
    if not OK:
        return

    # bioinfo apps as (code getter, name getter); the order is the one used
    # to look for the app of a result dataset
    bioinfo_app_getters = (
        (xlib.get_bedtools_code, xlib.get_bedtools_name),
        (xlib.get_blastplus_code, xlib.get_blastplus_name),
        (xlib.get_bowtie2_code, xlib.get_bowtie2_name),
        (xlib.get_busco_code, xlib.get_busco_name),
        (xlib.get_cd_hit_code, xlib.get_cd_hit_name),
        (xlib.get_cd_hit_est_code, xlib.get_cd_hit_est_name),
        (xlib.get_detonate_code, xlib.get_detonate_name),
        (xlib.get_emboss_code, xlib.get_emboss_name),
        (xlib.get_fastqc_code, xlib.get_fastqc_name),
        (xlib.get_gmap_code, xlib.get_gmap_name),
        (xlib.get_gmap_gsnap_code, xlib.get_gmap_gsnap_name),
        (xlib.get_gzip_code, xlib.get_gzip_name),
        (xlib.get_insilico_read_normalization_code, xlib.get_insilico_read_normalization_name),
        (xlib.get_miniconda3_code, xlib.get_miniconda3_name),
        (xlib.get_ngshelper_code, xlib.get_ngshelper_name),
        (xlib.get_quast_code, xlib.get_quast_name),
        (xlib.get_r_code, xlib.get_r_name),
        (xlib.get_ref_eval_code, xlib.get_ref_eval_name),
        (xlib.get_rnaquast_code, xlib.get_rnaquast_name),
        (xlib.get_rsem_code, xlib.get_rsem_name),
        (xlib.get_rsem_eval_code, xlib.get_rsem_eval_name),
        (xlib.get_samtools_code, xlib.get_samtools_name),
        (xlib.get_soapdenovotrans_code, xlib.get_soapdenovotrans_name),
        (xlib.get_star_code, xlib.get_star_name),
        (xlib.get_transabyss_code, xlib.get_transabyss_name),
        (xlib.get_transcript_filter_code, xlib.get_transcript_filter_name),
        (xlib.get_transcriptome_blastx_code, xlib.get_transcriptome_blastx_name),
        (xlib.get_transrate_code, xlib.get_transrate_name),
        (xlib.get_trimmomatic_code, xlib.get_trimmomatic_name),
        (xlib.get_trinity_code, xlib.get_trinity_name),
    )

    # build the run dictionary of the experiment
    result_dataset_dict = {}
    for line in stdout:
        result_dataset_id = line.rstrip('\n')
        if result_dataset_id == 'lost+found':
            continue

        # get the date and the time from an identification with the format
        # <code>-<yymmdd>-<hhmmss>; dummy values are used in any other case
        mo = re.search(r'^(.+)\-(.+)\-(.+)$', result_dataset_id)
        if mo is None:
            run_date = '0000-00-00'
            run_time = '00:00:00'
        else:
            yymmdd = mo.group(2)
            hhmmss = mo.group(3)
            run_date = '20{0}-{1}-{2}'.format(yymmdd[:2], yymmdd[2:4], yymmdd[4:])
            run_time = '{0}:{1}:{2}'.format(hhmmss[:2], hhmmss[2:4], hhmmss[4:])

        # get the name of the first bioinfo app whose code followed by "-"
        # prefixes the identification
        bioinfo_app_name = 'xxx'
        for (get_code, get_name) in bioinfo_app_getters:
            if result_dataset_id.startswith(get_code() + '-'):
                bioinfo_app_name = get_name()
                break

        result_dataset_dict[result_dataset_id] = {'experiment_id': self.wrapper_experiment_id.get(), 'result_dataset_id': result_dataset_id, 'bioinfo_app': bioinfo_app_name, 'date': run_date, 'time': run_time}

    # warn if there is not any run (the empty table is shown anyway)
    if not result_dataset_dict:
        message = 'There is not any run.'
        tkinter.messagebox.showwarning('{0} - {1}'.format(xlib.get_project_name(), self.head), message)

    # build the data list
    data_list = ['experiment_id', 'result_dataset_id', 'bioinfo_app', 'date', 'time']

    # build the data dictionary
    data_dict = {}
    data_dict['experiment_id'] = {'text': 'Experiment id. / Process', 'width': 200, 'aligment': 'left'}
    data_dict['result_dataset_id'] = {'text': 'Result dataset', 'width': 200, 'aligment': 'left'}
    data_dict['bioinfo_app'] = {'text': 'Bioinfo app / Utility', 'width': 200, 'aligment': 'left'}
    data_dict['date'] = {'text': 'Date', 'width': 80, 'aligment': 'right'}
    data_dict['time'] = {'text': 'Time', 'width': 80, 'aligment': 'right'}

    # create the dialog Table to show the runs and wait until it is closed
    dialog_table = gdialogs.DialogTable(self, 'Experiment runs in {0}/{1}'.format(xlib.get_cluster_result_dir(), self.wrapper_experiment_id.get()), 400, 900, data_list, data_dict, result_dataset_dict, 'view_result_logs', [self.wrapper_cluster_name.get()])
    self.wait_window(dialog_table)

    # close the form
    self.close()
def create_gmap_config_file(experiment_id='exp001', reference_dataset_id='NONE', reference_file='NONE', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create the GMAP config file with the default options. It is necessary to
    update the options in each run.

    The assembly software written to the file is the first known assembly
    software code that prefixes "assembly_dataset_id". The file path is the
    one returned by get_gmap_config_file(); its directory is created when it
    does not exist.

    Returns the tuple (OK, error_list). OK is False, and error_list holds the
    error messages, when the assembly software can not be determined from
    "assembly_dataset_id" or when the file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # known assembly software as (code getter, name getter), in matching order
    assembly_software_getters = (
        (xlib.get_soapdenovotrans_code, xlib.get_soapdenovotrans_name),
        (xlib.get_transabyss_code, xlib.get_transabyss_name),
        (xlib.get_trinity_code, xlib.get_trinity_name),
        (xlib.get_star_code, xlib.get_star_name),
        (xlib.get_cd_hit_est_code, xlib.get_cd_hit_est_name),
        (xlib.get_transcript_filter_code, xlib.get_transcript_filter_name),
    )

    # set the app: the first code that prefixes the assembly dataset id wins
    assembly_software = None
    for (get_code, _) in assembly_software_getters:
        code = get_code()
        if assembly_dataset_id.startswith(code):
            assembly_software = code
            break

    # stop before touching the file when the app is unknown, so that an
    # existing config file is not truncated and left half-written
    if assembly_software is None:
        error_list.append(f'*** ERROR: The assembly dataset id {assembly_dataset_id} does not start with a known assembly software code.')
        OK = False

    # create the GMAP config file and write the default options
    if OK:
        try:
            config_dir = os.path.dirname(get_gmap_config_file())
            if not os.path.exists(config_dir):
                os.makedirs(config_dir)
            with open(get_gmap_config_file(), mode='w', encoding='utf8') as file_id:
                # texts of the comments that enumerate the known assembly software
                assembly_software_text = ' or '.join(f'{get_code()} ({get_name()})' for (get_code, get_name) in assembly_software_getters)
                assembly_name_list = [get_name() for (_, get_name) in assembly_software_getters]
                assembly_type_text = '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(*assembly_name_list)
                # header comments
                file_id.write('# You must review the information of this file and update the values with the corresponding ones to the current run.\n')
                file_id.write('#\n')
                file_id.write(f'# The reference file must be located in the cluster directory {xlib.get_cluster_reference_dir()}/experiment_id/reference_dataset_id\n')
                file_id.write(f'# The assembly files must be located in the cluster directory {xlib.get_cluster_result_dir()}/experiment_id/assembly_dataset_id\n')
                file_id.write('# The experiment_id, reference_dataset_id, reference_file and assembly_dataset_id are fixed in the identification section.\n')
                file_id.write('#\n')
                file_id.write('# You can consult the parameters of GMAP and their meaning in http://research-pub.gene.com/gmap/.\n')
                file_id.write('#\n')
                file_id.write('# In section "GMAP parameters", the key "other_parameters" allows you to input additional parameters in the format:\n')
                file_id.write('#\n')
                file_id.write('# other_parameters = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]\n')
                file_id.write('#\n')
                file_id.write('# parameter-i is a parameter name of GMAP and value-i a valid value of parameter-i, e.g.\n')
                file_id.write('#\n')
                file_id.write('# other_parameters = --no-chimeras; --canonical-mode=2\n')
                file_id.write('\n')
                # section "identification"
                file_id.write('# This section has the information identifies the experiment.\n')
                file_id.write('[identification]\n')
                file_id.write('{0:<50} {1}\n'.format(f'experiment_id = {experiment_id}', '# experiment identification'))
                file_id.write('{0:<50} {1}\n'.format(f'reference_dataset_id = {reference_dataset_id}', '# reference dataset identification or NONE'))
                file_id.write('{0:<50} {1}\n'.format(f'reference_file = {reference_file}', '# reference file name or NONE'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_software = {assembly_software}', f'# assembly software: {assembly_software_text}'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_dataset_id = {assembly_dataset_id}', '# assembly dataset identification'))
                file_id.write('{0:<50} {1}\n'.format(f'assembly_type = {assembly_type}', assembly_type_text))
                file_id.write('\n')
                # section "GMAP parameters"
                file_id.write('# This section has the information to set the GMAP parameters\n')
                file_id.write('[GMAP parameters]\n')
                file_id.write('{0:<50} {1}\n'.format('threads = 2', '# number of threads for use'))
                file_id.write('{0:<50} {1}\n'.format('kmer = NONE', '# kmer size to use in genome database or NONE (the program will find the highest available kmer size in the genome database)'))
                file_id.write('{0:<50} {1}\n'.format('sampling = NONE', '# Sampling to use in genome database or NONE (the program will find the smallest available sampling value in the genome database within selected k-mer size)'))
                file_id.write('{0:<50} {1}\n'.format('input-buffer-size = 1000', '# size of input buffer'))
                file_id.write('{0:<50} {1}\n'.format('output-buffer-size = 1000', '# size of buffer size in queries for output thread'))
                file_id.write('{0:<50} {1}\n'.format('prunelevel = 0', '# pruning level: 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive)'))
                file_id.write('{0:<50} {1}\n'.format('format = COMPRESS', '# format for output: COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS'))
                file_id.write('{0:<50} {1}\n'.format('other_parameters = NONE', '# additional parameters to the previous ones or NONE'))
        except Exception as e:
            # report the cause too; a bare "except:" would also trap
            # KeyboardInterrupt and SystemExit
            error_list.append(f'*** EXCEPTION: "{e}".')
            error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_gmap_config_file()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file and a Bash script is
    written to the path returned by get_busco_process_script().

    Parameters:
        cluster_name: cluster name echoed by the script and used in the mail messages.
        current_run_dir: run directory where the script works; its base name is
            used as the BUSCO output name.

    Returns:
        A tuple (OK, error_list): OK is False when the script could not be built
        and error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    identification = busco_option_dict['identification']
    experiment_id = identification['experiment_id']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    parameters = busco_option_dict['BUSCO parameters']
    ncpu = parameters['ncpu']
    lineage_data_url = parameters['lineage_data_url']
    mode = parameters['mode'].lower()
    evalue = parameters['evalue']
    limit = parameters['limit']
    species = parameters['species']
    long_mode = parameters['long'].upper()
    # the Augustus options are kept as typed: their names and values are case
    # sensitive, so only the comparison with NONE is done in upper case
    augustus_options = parameters['augustus_options']

    # get the file and name from the lineage data url; the name is the text
    # before the first point (or the whole file name when there is no point)
    lineage_data_file = lineage_data_url.split('/')[-1]
    lineage_data = lineage_data_file.partition('.')[0]

    # set the transcriptome file path
    dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{dataset_dir}/filtered-transcriptome.fasta'

    # stop before writing anything when the transcriptome file is unknown, so
    # a truncated script is never left behind
    if transcriptome_file is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be determined for the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        error_list.append(f'*** ERROR: The file {get_busco_process_script()} can not be created')
        return (False, error_list)

    # write the BUSCO process script
    try:
        rule = '#-------------------------------------------------------------------------------\n'
        process_name = f'{xlib.get_busco_name()} process'
        mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
        mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
        download_script = f'import requests; r = requests.get(\'{lineage_data_url}\') ; open(\'{lineage_data_file}\' , \'wb\').write(r.content)'

        # environment and status variables
        script_lines = [
            '#!/bin/bash\n',
            rule,
            'SEP="#########################################"\n',
            'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
            'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
            'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
            'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
            rule,
            f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n',
            'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
            rule,
            f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
            f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
            f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
            'mkdir --parents $STATUS_DIR\n',
            'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
            'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
            rule,
            f'CURRENT_DIR={current_run_dir}\n',
            rule,
        ]

        # function init
        script_lines.extend([
            'function init\n',
            '{\n',
            ' INIT_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' echo "$SEP"\n',
            ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
            ' echo "$SEP"\n',
            f' echo "CLUSTER: {cluster_name}"\n',
            ' echo "HOST NAME: $HOSTNAME"\n',
            ' echo "HOST IP: $HOST_IP"\n',
            ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
            '}\n',
            rule,
        ])

        # function download_lineage_data
        script_lines.extend([
            'function download_lineage_data\n',
            '{\n',
            ' cd $CURRENT_DIR\n',
            ' echo "$SEP"\n',
            ' echo "Downloading lineage data ..."\n',
            f' $MINICONDA3_BIN_PATH/python3 -c "{download_script}"\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error download_script $RC; fi\n',
            f' tar -xzvf ./{lineage_data_file}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n',
            f' rm ./{lineage_data_file}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n',
            ' echo "Lineage data are downloaded."\n',
            '}\n',
            rule,
        ])

        # function run_busco_process
        script_lines.extend([
            'function run_busco_process\n',
            '{\n',
            f' source activate {xlib.get_busco_anaconda_code()}\n',
            ' cd $CURRENT_DIR\n',
            ' echo "$SEP"\n',
            ' echo "Assessing the transcriptome quality ..."\n',
            ' /usr/bin/time \\\n',
            f' --format="{xlib.get_time_output_format(separator=False)}" \\\n',
            ' busco \\\n',
            f' --cpu={ncpu} \\\n',
            f' --lineage_dataset=./{lineage_data} \\\n',
            f' --mode={mode} \\\n',
            f' --evalue={evalue} \\\n',
            f' --limit={limit} \\\n',
        ])
        if species.upper() != 'NONE':
            script_lines.append(f' --species={species} \\\n')
        if long_mode == 'YES':
            script_lines.append(' --long \\\n')
        if augustus_options.upper() != 'NONE':
            # NOTE(review): confirm the name of this flag against the installed BUSCO version
            script_lines.append(f' --august_options="{augustus_options}" \\\n')
        script_lines.extend([
            f' --in={transcriptome_file} \\\n',
            f' --out={os.path.basename(current_run_dir)}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi\n',
            ' echo "The assessment is done."\n',
            ' conda deactivate\n',
            '}\n',
            rule,
        ])

        # function end
        script_lines.extend([
            'function end\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail ok\n',
            ' touch $SCRIPT_STATUS_OK\n',
            ' exit 0\n',
            '}\n',
            rule,
        ])

        # function manage_error
        script_lines.extend([
            'function manage_error\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "ERROR: $1 returned error $2"\n',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail wrong\n',
            ' touch $SCRIPT_STATUS_WRONG\n',
            ' exit 3\n',
            '}\n',
            rule,
        ])

        # function send_mail
        script_lines.extend([
            'function send_mail\n',
            '{\n',
            f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
            ' if [ "$1" == "ok" ]; then\n',
            f' MESSAGE="{mail_message_ok}"\n',
            ' elif [ "$1" == "wrong" ]; then\n',
            f' MESSAGE="{mail_message_wrong}"\n',
            ' else\n',
            ' MESSAGE=""\n',
            ' fi\n',
            ' DESTINATION_FILE=mail-destination.json\n',
            ' echo "{" > $DESTINATION_FILE\n',
            f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
            ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
            ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
            ' echo "}" >> $DESTINATION_FILE\n',
            ' MESSAGE_FILE=mail-message.json\n',
            ' echo "{" > $MESSAGE_FILE\n',
            ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }," >> $MESSAGE_FILE\n',
            ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo "}" >> $MESSAGE_FILE\n',
            f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
            '}\n',
            rule,
        ])

        # function calculate_duration and the call sequence
        script_lines.extend([
            'function calculate_duration\n',
            '{\n',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
            ' HH=`expr $DURATION / 3600`\n',
            ' MM=`expr $DURATION % 3600 / 60`\n',
            ' SS=`expr $DURATION % 60`\n',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
            '}\n',
            rule,
            'init\n',
            'download_lineage_data\n',
            'run_busco_process\n',
            'end\n',
        ])

        # the file is opened only when every line is available
        if not os.path.exists(os.path.dirname(get_busco_process_script())):
            os.makedirs(os.path.dirname(get_busco_process_script()))
        with open(get_busco_process_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.writelines(script_lines)
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_busco_process_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def create_busco_config_file(experiment_id='exp001', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create the BUSCO config file with the default options.

    The assembly software is deduced from the prefix of the assembly dataset
    identification. The file written is a template: its values have to be
    reviewed and updated before each run.

    Returns a tuple (OK, error_list); OK is False and error_list holds the
    messages when the file could not be written.
    '''

    # initialize the control variable and the error list
    error_list = []
    OK = True

    # set the assembly software from the dataset identification prefix
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_ggtrinity_code()):
        assembly_software = xlib.get_ggtrinity_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()

    # create the BUSCO config file and write the default options
    try:
        config_dir = os.path.dirname(get_busco_config_file())
        if not os.path.exists(config_dir):
            os.makedirs(os.path.dirname(get_busco_config_file()))

        with open(get_busco_config_file(), mode='w', encoding='iso-8859-1', newline='\n') as file_id:

            emit = file_id.write

            def entry(setting, remark):
                # write a "key = value" setting padded to 50 columns followed by its remark
                emit('{0:<50} {1}\n'.format(setting, remark))

            # header comments
            emit('# You must review the information of this file and update the values with the corresponding ones to the current run.\n')
            emit('#\n')
            emit(f'# The reference file has to be located in the cluster directory {xlib.get_cluster_reference_dir()}/experiment_id/reference_dataset_id\n')
            emit(f'# The assembly files have to be located in the cluster directory {xlib.get_cluster_result_dir()}/experiment_id/assembly_dataset_id\n')
            for text in (
                '# The experiment_id and assembly_dataset_id names are fixed in the identification section.\n',
                '#\n',
                '# In section "BUSCO parameters", the key "augustus_options" allows you to input additional August parameters in the format:\n',
                '#\n',
                '# augustus_options = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]\n',
                '#\n',
                '# parameter-i is a parameter name of Augustus and value-i a valid value of parameter-i, e.g.\n',
                '#\n',
                '# augustus_options = --translation_table=6 --progress=true\n',
                '#\n',
                '# You can consult the parameters of BUSCO and their meaning in "http://busco.ezlab.org/"\n',
                '# and the ones of August in "http://bioinf.uni-greifswald.de/augustus/".\n',
                '\n',
                '# This section has the information identifies the experiment.\n',
                '[identification]\n',
            ):
                emit(text)

            # section "identification"
            entry(f'experiment_id = {experiment_id}', '# experiment identification')
            entry(f'assembly_software = {assembly_software}', f'# assembly software: {get_assembly_software_code_list_text()}')
            entry(f'assembly_dataset_id = {assembly_dataset_id}', '# assembly dataset identification')
            entry(f'assembly_type = {assembly_type}', f'# assembly type: CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()}; NONE in any other case')

            # section "BUSCO parameters"
            emit('\n')
            emit('# This section has the information to set the BUSCO parameters\n')
            emit('[BUSCO parameters]\n')
            entry('ncpu = 4', '# number of threads/cores for use')
            entry('lineage_data_url = https://busco-data.ezlab.org/v4/data/lineages/viridiplantae_odb10.2020-09-10.tar.gz', '# the url of lineage data file that will be used')
            entry('mode = TRAN', f'# mode: {get_mode_code_list_text()}')
            entry('evalue = 1E-03', '# E-value cutoff for BLAST searches')
            entry('limit = 3', '# number of candidate regions to consider')
            entry('species = NONE', '# identifier of existing Augustus species gene finding parameters or NONE')
            entry('long = NO', f'# Augustus optimization mode for self-training: {get_long_code_list_text()}')
            entry('augustus_options = NONE', '# additional parameters to August or NONE')
    except Exception as e:
        OK = False
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_busco_config_file()} can not be recreated')

    # return the control variable and the error list
    return (OK, error_list)
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the CD-HIT-EST config file and a Bash script is
    written to the path returned by get_cd_hit_est_process_script().

    Parameters:
        cluster_name: cluster name shown in the log and mail messages of the script.
        current_run_dir: run directory where the script works and where the
            clustered transcriptome is written.

    Returns:
        A tuple (OK, error_list): OK is False when the script could not be built
        and error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    identification = cd_hit_est_option_dict['identification']
    experiment_id = identification['experiment_id']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    parameters = cd_hit_est_option_dict['CD-HIT-EST parameters']
    threads = parameters['threads']
    memory_limit = parameters['memory_limit']
    seq_identity_threshold = parameters['seq_identity_threshold']
    word_length = parameters['word_length']
    mask = parameters['mask']
    match = parameters['match']
    mismatch = parameters['mismatch']
    other_parameters = parameters['other_parameters']

    # set the transcriptome file path
    dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(dataset_dir, experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(dataset_dir, experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(dataset_dir)
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(dataset_dir)
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(dataset_dir)
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(dataset_dir)
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(dataset_dir)

    # stop before writing anything when the transcriptome file is unknown, so
    # a truncated script is never left behind
    if transcriptome_file is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined for the assembly software "{0}" and the assembly type "{1}".'.format(assembly_software, assembly_type))
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_cd_hit_est_process_script()))
        return (False, error_list)

    # set the output file path
    output_file = '{0}/clustered-transcriptome.fasta'.format(current_run_dir)

    # write the CD-HIT-EST process script
    try:

        # translate "--name[=value]" additional parameters into "-name [value]" options
        extra_options = []
        if other_parameters.upper() != 'NONE':
            for parameter in (x.strip() for x in other_parameters.split(';')):
                # an empty item (e.g. after a trailing ";") carries no parameter
                if not parameter:
                    continue
                if parameter.find('=') > 0:
                    mo = re.search(r'^--(.+)=(.+)$', parameter)
                else:
                    mo = re.search(r'^--(.+)$', parameter)
                if mo is None:
                    raise ValueError('the additional parameter "{0}" has not the format --parameter[=value]'.format(parameter))
                extra_options.append(' ' + ' '.join(f'-{mo.group(1).strip()} {mo.group(2).strip() if mo.lastindex == 2 else ""}'.split()))

        # every command line but the last one ends with a continuation mark;
        # joining them guarantees the last option never gets a dangling "\"
        command_lines = [
            ' /usr/bin/time',
            ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K"',
            ' cd-hit-est',
            ' -T {0}'.format(threads),
            ' -M {0}'.format(memory_limit),
            ' -i {0}'.format(transcriptome_file),
            ' -c {0}'.format(seq_identity_threshold),
            ' -n {0}'.format(word_length),
            ' -mask {0}'.format(mask),
            ' -match {0}'.format(match),
            ' -mismatch {0}'.format(mismatch),
            ' -o {0}'.format(output_file),
        ] + extra_options

        rule = '#-------------------------------------------------------------------------------'
        process_name = xlib.get_cd_hit_est_name()

        def mail_lines(status):
            # lines that mail the final status (OK or WRONG) of this process
            return [
                ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
                ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), process_name),
                f' MESSAGE="The {process_name} process in node $HOSTNAME of cluster {cluster_name} ended {status} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"',
                ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
            ]

        script_lines = [
            '#!/bin/bash',
            rule,
            'CDHIT_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_cd_hit_bioconda_code()),
            'PATH=$CDHIT_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_cd_hit_bioconda_code()),
            rule,
            'function init',
            '{',
            ' INIT_DATETIME=`date --utc +%s`',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' echo "$SEP"',
            ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            rule,
            'function run_cd_hit_est_process',
            '{',
            ' cd {0}'.format(current_run_dir),
            ' echo "$SEP"',
            ' echo "Running {0} process ..."'.format(process_name),
            ' \\\n'.join(command_lines),
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi',
            '}',
            rule,
            'function end',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
        ]
        script_lines.extend(mail_lines('OK'))
        script_lines.extend([
            ' exit 0',
            '}',
            rule,
            'function manage_error',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "ERROR: $1 returned error $2"',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
        ])
        script_lines.extend(mail_lines('WRONG'))
        script_lines.extend([
            ' exit 3',
            '}',
            rule,
            'function calculate_duration',
            '{',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            ' HH=`expr $DURATION / 3600`',
            ' MM=`expr $DURATION % 3600 / 60`',
            ' SS=`expr $DURATION % 60`',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            rule,
            'init',
            'run_cd_hit_est_process',
            'end',
        ])

        # the file is opened only when every line is available
        if not os.path.exists(os.path.dirname(get_cd_hit_est_process_script())):
            os.makedirs(os.path.dirname(get_cd_hit_est_process_script()))
        with open(get_cd_hit_est_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('\n'.join(script_lines) + '\n')
    except Exception as e:
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_cd_hit_est_process_script()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_quast_config_file(strict):
    '''
    Validate the QUAST config file of a run.

    The sections "identification" and "QUAST parameters" and their keys are
    checked; every problem found adds a message to the error list.

    Parameters:
        strict: accepted for interface compatibility; it is not used by the
            checks done in this function.

    Returns:
        A tuple (OK, error_list): OK is False when any check fails and
        error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    def get_assembly_names():
        # names of the assembly applications, in the same order as their codes
        return [xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()]

    # get the option dictionary
    try:
        quast_option_dict = xlib.get_option_dict(get_quast_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(quast_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = quast_option_dict.get('identification', {})

            # codes of the assembly applications; the first one is SOAPdenovo-Trans
            assembly_codes = [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]

            # check section "identification" - keys that only have to exist
            for key in ('experiment_id', 'reference_dataset_id', 'reference_file'):
                if identification.get(key, not_found) == not_found:
                    error_list.append('*** ERROR: the key "{0}" is not found in the section "identification".'.format(key))
                    OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in assembly_codes:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(*assembly_codes))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(tuple(assembly_codes)):
                # every placeholder is an application name (the last one used to be a code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(*get_assembly_names()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(assembly_codes[0]):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(tuple(assembly_codes[1:])):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(*get_assembly_names()[1:]))
                    OK = False

        # check section "QUAST parameters"
        if 'QUAST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "QUAST parameters" is not found.')
            OK = False
        else:

            # check section "QUAST parameters" - key "threads"
            threads = quast_option_dict.get('QUAST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "QUAST parameters".')
                OK = False
            else:
                # a value that is not an integer is reported like a value lower than 1
                try:
                    threads_ok = int(threads) >= 1
                except (ValueError, TypeError):
                    threads_ok = False
                if not threads_ok:
                    error_list.append('*** ERROR: the key "threads" in the section "QUAST parameters" must be an integer value greater or equal to 1.')
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_quast_name()))

    # return the control variable and the error list
    return (OK, error_list)
def form_list_cluster_experiment_processes():
    '''
    List the processes of an experiment in the cluster.

    The user is asked for a running cluster and for one of its experiments.
    Then the entries of the experiment result directory that are not regular
    files (except "lost+found") are printed, each one next to the name of the
    bioinfo app or utility denoted by the prefix of its identification.
    Nothing is returned: every outcome is shown on the console.
    '''

    # initialize the control variable
    OK = True

    # the SSH client connection is not created yet; this flag is kept apart from
    # OK so the connection is closed even if a later step fails
    is_ssh_client_created = False

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Logs - List experiment processes in the cluster')

    # get the cluster name
    print(xlib.get_separator())
    if xec2.get_running_cluster_list(volume_creator_included=False) != []:
        cluster_name = cinputs.input_cluster_name(volume_creator_included=False, help=True)
    else:
        print('WARNING: There is not any running cluster.')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        is_ssh_client_created = OK
        for error in error_list:
            log.write('{0}\n'.format(error))

    # get experiment identification
    if OK:
        experiment_id = cinputs.input_experiment_id(ssh_client, help=True)
        if experiment_id == '':
            print('WARNING: The cluster {0} has not experiment data.'.format(cluster_name))
            OK = False

    # get the result dataset list of the experiment
    if OK:
        # the command echoes every entry of the experiment result directory that is not a regular file
        command = 'cd {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(xlib.get_cluster_result_dir(), experiment_id)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            result_dataset_id_list = []
            for line in stdout:
                line = line.rstrip('\n')
                if line != 'lost+found':
                    result_dataset_id_list.append(line)

    # print the result dataset identification list of the experiment
    if OK:
        print(xlib.get_separator())
        if result_dataset_id_list == []:
            print('*** WARNING: There is not any result dataset of the experiment {0}.'.format(experiment_id))
        else:
            result_dataset_id_list.sort()
            # set data width
            result_dataset_width = 25
            bioinfo_app_width = 25
            # set line template
            line_template = '{0:' + str(result_dataset_width) + '} {1:' + str(bioinfo_app_width) + '}'
            # print header
            print(line_template.format('Result dataset', 'Bioinfo app / Utility'))
            print(line_template.format('=' * result_dataset_width, '=' * bioinfo_app_width))
            # print detail lines
            for result_dataset_id in result_dataset_id_list:
                print(line_template.format(result_dataset_id, _get_bioinfo_app_name(result_dataset_id)))

    # close the SSH client connection whenever it was created, not only when every step ended OK
    if is_ssh_client_created:
        xssh.close_ssh_client_connection(ssh_client)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')


def _get_bioinfo_app_name(result_dataset_id):
    '''
    Get the name of the bioinfo app or utility whose code followed by "-" is
    the prefix of a result dataset identification ("xxx" if no code matches).
    '''

    # pairs (code getter, name getter) checked in order; both the CD-HIT code
    # and the CD-HIT-EST code are shown with the CD-HIT-EST name
    getter_pair_list = [
        (xlib.get_bedtools_code, xlib.get_bedtools_name),
        (xlib.get_blastplus_code, xlib.get_blastplus_name),
        (xlib.get_bowtie2_code, xlib.get_bowtie2_name),
        (xlib.get_busco_code, xlib.get_busco_name),
        (xlib.get_cd_hit_code, xlib.get_cd_hit_est_name),
        (xlib.get_cd_hit_est_code, xlib.get_cd_hit_est_name),
        (xlib.get_detonate_code, xlib.get_detonate_name),
        (xlib.get_emboss_code, xlib.get_emboss_name),
        (xlib.get_fastqc_code, xlib.get_fastqc_name),
        (xlib.get_gmap_code, xlib.get_gmap_name),
        (xlib.get_gmap_gsnap_code, xlib.get_gmap_gsnap_name),
        (xlib.get_gzip_code, xlib.get_gzip_name),
        (xlib.get_insilico_read_normalization_code, xlib.get_insilico_read_normalization_name),
        (xlib.get_miniconda3_code, xlib.get_miniconda3_name),
        (xlib.get_ngshelper_code, xlib.get_ngshelper_name),
        (xlib.get_quast_code, xlib.get_quast_name),
        (xlib.get_r_code, xlib.get_r_name),
        (xlib.get_ref_eval_code, xlib.get_ref_eval_name),
        (xlib.get_rnaquast_code, xlib.get_rnaquast_name),
        (xlib.get_rsem_code, xlib.get_rsem_name),
        (xlib.get_rsem_eval_code, xlib.get_rsem_eval_name),
        (xlib.get_samtools_code, xlib.get_samtools_name),
        (xlib.get_soapdenovotrans_code, xlib.get_soapdenovotrans_name),
        (xlib.get_star_code, xlib.get_star_name),
        (xlib.get_transabyss_code, xlib.get_transabyss_name),
        (xlib.get_transcript_filter_code, xlib.get_transcript_filter_name),
        (xlib.get_transcriptome_blastx_code, xlib.get_transcriptome_blastx_name),
        (xlib.get_transrate_code, xlib.get_transrate_name),
        (xlib.get_trimmomatic_code, xlib.get_trimmomatic_name),
        (xlib.get_trinity_code, xlib.get_trinity_name),
    ]

    # return the name of the first app whose code matches
    for (get_code, get_name) in getter_pair_list:
        if result_dataset_id.startswith(get_code() + '-'):
            return get_name()

    # no code matches
    return 'xxx'
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file and a Bash script is
    written in the path given by get_busco_process_script(). The script
    downloads the lineage data, runs run_BUSCO.py on the transcriptome file of
    the assembly dataset and mails the result to the contact address.

    cluster_name is only used in the messages of the script. current_run_dir
    is the directory where the script works; its base name is the BUSCO output
    name.

    A tuple (OK, error_list) is returned: OK is False and error_list holds the
    error messages when the script can not be built.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    identification_dict = busco_option_dict['identification']
    parameter_dict = busco_option_dict['BUSCO parameters']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    # the config validation accepts the assembly type in any case, so it is normalized here
    assembly_type = identification_dict['assembly_type'].upper()
    ncpu = parameter_dict['ncpu']
    lineage_data = parameter_dict['lineage_data']
    lineage_data_file = '{0}.tar.gz'.format(lineage_data)
    lineage_data_url = 'http://busco.ezlab.org/v2/datasets/{0}'.format(lineage_data_file)
    mode = parameter_dict['mode'].lower()
    evalue = parameter_dict['evalue']
    limit = parameter_dict['limit']
    species = parameter_dict['species']
    long_mode = parameter_dict['long'].upper()
    # the Augustus options are kept as typed: the config example shows them in lower case
    augustus_options = parameter_dict['augustus_options']

    # set the transcriptome file path (None if it can not be determined)
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(assembly_dataset_dir, experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(assembly_dataset_dir, experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(assembly_dataset_dir)
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(assembly_dataset_dir)
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(assembly_dataset_dir)
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(assembly_dataset_dir)
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(assembly_dataset_dir)

    # check the transcriptome file path before writing anything
    if transcriptome_file is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined with the assembly software {0} and the assembly type {1}.'.format(assembly_software, assembly_type))
        OK = False

    # write the BUSCO process script
    if OK:
        try:

            # build the script lines before opening the file so a failure does not leave a truncated script
            separator_line = '#-------------------------------------------------------------------------------'
            script_line_list = [
                '#!/bin/bash',
                separator_line,
                'BUSCO_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_busco_bioconda_code()),
                'export PATH=$BUSCO_PATH:$PATH',
                'SEP="#########################################"',
                'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
                'source activate {0}'.format(xlib.get_busco_bioconda_code()),
                separator_line,
                'function init',
                '{',
                ' INIT_DATETIME=`date --utc +%s`',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                ' echo "$SEP"',
                ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
                '}',
                separator_line,
                'function download_lineage_data',
                '{',
                ' cd {0}'.format(current_run_dir),
                ' echo "$SEP"',
                ' echo "Downloading lineage data ..."',
                ' wget --quiet --output-document ./{0} {1}'.format(lineage_data_file, lineage_data_url),
                ' tar -xzvf ./{0}'.format(lineage_data_file),
                ' rm ./{0}'.format(lineage_data_file),
                '}',
                separator_line,
                'function run_busco_process',
                '{',
                ' cd {0}'.format(current_run_dir),
                ' echo "$SEP"',
                ' run_BUSCO.py --version',
                ' echo "$SEP"',
                ' /usr/bin/time \\',
                ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
                ' run_BUSCO.py \\',
                ' --cpu={0} \\'.format(ncpu),
                ' --lineage_path=./{0} \\'.format(lineage_data),
                ' --mode={0} \\'.format(mode),
                ' --evalue={0} \\'.format(evalue),
                ' --limit={0} \\'.format(limit),
            ]

            # the optional parameters are only written when they are set
            if species.upper() != 'NONE':
                script_line_list.append(' --species={0} \\'.format(species))
            if long_mode == 'YES':
                script_line_list.append(' --long \\')
            if augustus_options.upper() != 'NONE':
                # NOTE(review): the option name "--august_options" is kept as it was; confirm it
                # against the options of the installed run_BUSCO.py (possibly "--augustus_parameters")
                script_line_list.append(" --august_options='{0}' \\".format(augustus_options))

            script_line_list.extend([
                ' --in={0} \\'.format(transcriptome_file),
                ' --out={0}'.format(os.path.basename(current_run_dir)),
                ' RC=$?',
                ' if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi',
                '}',
                separator_line,
                'function end',
                '{',
                ' END_DATETIME=`date --utc +%s`',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                ' calculate_duration',
                ' echo "$SEP"',
                ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
                ' echo "$SEP"',
                ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
                ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_busco_name()),
                ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_busco_name(), cluster_name),
                ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
                ' exit 0',
                '}',
                separator_line,
                'function manage_error',
                '{',
                ' END_DATETIME=`date --utc +%s`',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
                ' calculate_duration',
                ' echo "$SEP"',
                ' echo "ERROR: $1 returned error $2"',
                ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
                ' echo "$SEP"',
                ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
                ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_busco_name()),
                ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_busco_name(), cluster_name),
                ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
                ' exit 3',
                '}',
                separator_line,
                'function calculate_duration',
                '{',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
                ' HH=`expr $DURATION / 3600`',
                ' MM=`expr $DURATION % 3600 / 60`',
                ' SS=`expr $DURATION % 60`',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
                '}',
                separator_line,
                'init',
                'download_lineage_data',
                'run_busco_process',
                'end',
            ])

            # create the script directory if it does not exist
            script_dir = os.path.dirname(get_busco_process_script())
            if not os.path.exists(script_dir):
                os.makedirs(script_dir)

            # write the script with Unix line endings because it is run in the cluster
            with open(get_busco_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
                file_id.writelines('{0}\n'.format(script_line) for script_line in script_line_list)

        except Exception:
            error_list.append('*** ERROR: The file {0} can not be created'.format(get_busco_process_script()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def create_busco_config_file(experiment_id='exp001', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create BUSCO config file with the default options. It is necessary
    update the options in each run.

    The identification section is filled with the received experiment_id,
    assembly_dataset_id and assembly_type; the assembly software is the code
    that is the prefix of assembly_dataset_id. The section "BUSCO parameters"
    is filled with fixed default values.

    A tuple (OK, error_list) is returned: OK is False and error_list holds the
    error messages when the file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the assembly software: the first code that is the prefix of the assembly dataset identification
    assembly_software = None
    for get_code in [xlib.get_soapdenovotrans_code, xlib.get_transabyss_code, xlib.get_trinity_code, xlib.get_star_code, xlib.get_cd_hit_est_code, xlib.get_transcript_filter_code]:
        code = get_code()
        if assembly_dataset_id.startswith(code):
            assembly_software = code
            break

    # an unknown prefix is reported before touching the file so the previous config file is not truncated
    if assembly_software is None:
        error_list.append('*** ERROR: The assembly software of the dataset {0} can not be determined from its identification.'.format(assembly_dataset_id))
        OK = False

    # create the BUSCO config file and write the default options
    if OK:
        try:

            # build the header comments
            config_line_list = [
                '# You must review the information of this file and update the values with the corresponding ones to the current run.',
                '#',
                '# The reference file must be located in the cluster directory {0}/experiment_id/reference_dataset_id'.format(xlib.get_cluster_reference_dir()),
                '# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'.format(xlib.get_cluster_result_dir()),
                '# The experiment_id and assembly_dataset_id names are fixed in the identification section.',
                '#',
                '# In section "BUSCO parameters", the key "augustus_options" allows you to input additional August parameters in the format:',
                '#',
                '# augustus_options = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]',
                '#',
                '# parameter-i is a parameter name of Augustus and value-i a valid value of parameter-i, e.g.',
                '#',
                '# augustus_options = --translation_table=6 --progress=true',
                '#',
                '# You can consult the parameters of BUSCO and their meaning in http://busco.ezlab.org/.',
                '# and August ones in http://bioinf.uni-greifswald.de/augustus/.',
                '',
            ]

            # build the section "identification"; the option lines are padded to align their comments
            option_template = '{0:<50} {1}'
            config_line_list.extend([
                '# This section has the information identifies the experiment.',
                '[identification]',
                option_template.format('experiment_id = {0}'.format(experiment_id), '# experiment identification'),
                option_template.format('assembly_software = {0}'.format(assembly_software), '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'.format(xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name(), xlib.get_transabyss_code(), xlib.get_transabyss_name(), xlib.get_trinity_code(), xlib.get_trinity_name(), xlib.get_star_code(), xlib.get_star_name(), xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name())),
                option_template.format('assembly_dataset_id = {0}'.format(assembly_dataset_id), '# assembly dataset identification'),
                option_template.format('assembly_type = {0}'.format(assembly_type), '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name())),
                '',
            ])

            # build the section "BUSCO parameters"
            config_line_list.extend([
                '# This section has the information to set the BUSCO parameters',
                '[BUSCO parameters]',
                option_template.format('ncpu = 2', '# number of threads/cores for use'),
                option_template.format('lineage_data = embryophyta_odb9', '# value to find the lineage data url in BUSCO web (e.g. embryophyta -> http://busco.ezlab.org/v2/datasets/embryophyta_odb9.tar.gz)'),
                option_template.format('mode = tran', '# geno (genome assemblies, DNA) or tran (transcriptome assemblies, DNA) or prot (annotated gene sets, proteins)'),
                option_template.format('evalue = 1e-03', '# E-value cutoff for BLAST searches'),
                option_template.format('limit = 3', '# number of candidate regions to consider'),
                option_template.format('species = NONE', '# identifier of existing Augustus species gene finding parameters or NONE'),
                option_template.format('long = NO', '# Augustus optimization mode for self-training: YES or NO'),
                option_template.format('augustus_options = NONE', '# additional parameters to August or NONE'),
            ])

            # create the config file directory if it does not exist
            config_dir = os.path.dirname(get_busco_config_file())
            if not os.path.exists(config_dir):
                os.makedirs(config_dir)

            # write the config file
            with open(get_busco_config_file(), mode='w', encoding='utf8') as file_id:
                file_id.writelines('{0}\n'.format(config_line) for config_line in config_line_list)

        except Exception:
            error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_busco_config_file()))
            OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_busco_config_file(strict):
    '''
    Validate the BUSCO config file of a run.

    The sections "identification" and "BUSCO parameters" are checked: every
    expected key has to be present and, where a rule exists, its value has to
    be valid (known assembly software, positive integers, positive float,
    allowed literals or a well-formed Augustus parameter list).

    strict is accepted for interface compatibility; the checks done here do
    not depend on it.

    A tuple (OK, error_list) is returned: OK is False and error_list holds one
    message per error found (plus a final summary message) when the file is
    not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = busco_option_dict.get('identification', {})

            # get the codes of the assembly software whose datasets can be assessed
            soapdenovotrans_code = xlib.get_soapdenovotrans_code()
            other_code_tuple = (xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())
            assembly_code_tuple = (soapdenovotrans_code,) + other_code_tuple

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in assembly_code_tuple:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(*assembly_code_tuple))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(assembly_code_tuple):
                # every software is shown by its name (the transcript filter was shown by its code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            # (the allowed values depend on the software denoted by the dataset identification prefix)
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(soapdenovotrans_code):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(other_code_tuple):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            parameter_dict = busco_option_dict.get('BUSCO parameters', {})

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = parameter_dict.get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_ncpu_valid = int(ncpu) >= 1
                except (ValueError, TypeError):
                    is_ncpu_valid = False
                if not is_ncpu_valid:
                    error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "BUSCO parameters" - key "lineage_data"
            lineage_data = parameter_dict.get('lineage_data', not_found)
            if lineage_data == not_found:
                error_list.append('*** ERROR: the key "lineage_data" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "mode"
            # (the value is compared with the sentinel before changing its case, otherwise a missing key is never detected)
            mode = parameter_dict.get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                OK = False
            elif mode.lower() not in ['geno', 'tran', 'prot']:
                error_list.append('*** ERROR: the key "mode" value in the section "BUSCO parameters" must be geno or tran or prot.')
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = parameter_dict.get('evalue', not_found)
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_evalue_valid = not float(evalue) <= 0
                except (ValueError, TypeError):
                    is_evalue_valid = False
                if not is_evalue_valid:
                    error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                    OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = parameter_dict.get('limit', not_found)
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_limit_valid = int(limit) >= 1
                except (ValueError, TypeError):
                    is_limit_valid = False
                if not is_limit_valid:
                    error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "BUSCO parameters" - key "species"
            species = parameter_dict.get('species', not_found)
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "long"
            long_mode = parameter_dict.get('long', not_found)
            if long_mode == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                OK = False
            elif long_mode.upper() not in ['YES', 'NO']:
                error_list.append('*** ERROR: the key "long" value in the section "BUSCO parameters" must be YES or NO.')
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = parameter_dict.get('augustus_options', not_found)
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                OK = False
            elif augustus_options.upper() != 'NONE':
                # every item separated by ";" has to look like "--name=value" or "--name"
                for parameter in [x.strip() for x in augustus_options.split(';')]:
                    if parameter.find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                    else:
                        pattern = r'^--(.+)$'
                    if re.search(pattern, parameter) is None:
                        error_list.append('*** ERROR: the value of the key "augustus_options" in the section "BUSCO parameters" must be NONE or a valid August parameter list.')
                        OK = False
                        break

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_busco_name()))

    # return the control variable and the error list
    return (OK, error_list)
def create_cd_hit_est_config_file(experiment_id='exp001', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create the CD-HIT-EST config file with the default options.

    The written values are defaults that have to be reviewed and updated in
    each run. The assembly software is derived from the prefix of
    "assembly_dataset_id".

    Return the tuple (OK, error_list): OK is False when the assembly software
    can not be derived from the dataset identification or when the file can
    not be written; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the assembly software: the codes are tried in a fixed order and the first prefix that matches wins
    assembly_software = None
    for software_code in (
            xlib.get_soapdenovotrans_code(),
            xlib.get_transabyss_code(),
            xlib.get_trinity_code(),
            xlib.get_star_code(),
            xlib.get_cd_hit_est_code(),
            xlib.get_transcript_filter_code()):
        if assembly_dataset_id.startswith(software_code):
            assembly_software = software_code
            break

    # an unknown prefix is reported here, before the config file is touched; otherwise it would
    # only surface as an unbound variable while writing, leaving a truncated file behind
    if assembly_software is None:
        error_list.append('*** ERROR: the assembly dataset {0} does not correspond to any supported assembly software.'.format(assembly_dataset_id))
        OK = False
        return (OK, error_list)

    def text_line(text):
        # a line holding only the received text
        return '{0}\n'.format(text)

    def option_line(setting, remark):
        # a "key = value" setting padded to 50 characters and followed by its comment
        return '{0:<50} {1}\n'.format(setting, remark)

    # create the CD-HIT-EST config file and write the default options
    try:

        # build the whole content before opening the file, so a failure here does not truncate an existing file
        line_list = [
            text_line('# You must review the information of this file and update the values with the corresponding ones to the current run.'),
            text_line('#'),
            text_line('# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'.format(xlib.get_cluster_result_dir())),
            text_line('# The experiment_id and assembly_dataset_id names are fixed in the identification section.'),
            text_line('#'),
            text_line('# You can consult the parameters of CD-HIT-EST (CD-HIT package) and their meaning in http://weizhong-lab.ucsd.edu/cd-hit/.'),
            text_line('#'),
            text_line('# In section "CD-HIT-EST parameters", the key "other_parameters" allows you to input additional parameters in the format:'),
            text_line('#'),
            text_line('# other_parameters = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]'),
            text_line('#'),
            text_line('# parameter-i is a parameter name of CD-HIT-EST and value-i a valid value of parameter-i, e.g.'),
            text_line('#'),
            text_line('# other_parameters = --aS=0.9; --U=10'),
            text_line(''),
            text_line('# This section has the information identifies the assembly result dataset.'),
            text_line('[identification]'),
            option_line('experiment_id = {0}'.format(experiment_id), '# experiment identification'),
            option_line(
                'assembly_software = {0}'.format(assembly_software),
                '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'.format(
                    xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name(),
                    xlib.get_transabyss_code(), xlib.get_transabyss_name(),
                    xlib.get_trinity_code(), xlib.get_trinity_name(),
                    xlib.get_star_code(), xlib.get_star_name(),
                    xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(),
                    xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name())),
            option_line('assembly_dataset_id = {0}'.format(assembly_dataset_id), '# assembly dataset identification'),
            option_line(
                'assembly_type = {0}'.format(assembly_type),
                '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(
                    xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(),
                    xlib.get_trinity_name(), xlib.get_star_name(),
                    xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name())),
            text_line(''),
            text_line('# This section has the information to set the CD-HIT-EST parameters'),
            text_line('[CD-HIT-EST parameters]'),
            option_line('threads = 2', '# number of threads for use; with 0, all CPUs will be used'),
            option_line('memory_limit = 800', '# memory limit (in MB) for the program; 0 for unlimitted'),
            option_line('seq_identity_threshold = 0.9', '# sequence identity threshold'),
            option_line('word_length = 5', '# word length'),
            option_line('mask = NX', '# masking letters (e.g. -mask NX, to mask out both "N" and "X")'),
            option_line('match = 2', '# matching score (1 for T-U and N-N)'),
            option_line('mismatch = -2', '# mismatching score'),
            option_line('other_parameters = NONE', '# additional parameters to the previous ones or NONE'),
        ]

        # make sure the directory of the config file exists and write the file
        config_file = get_cd_hit_est_config_file()
        os.makedirs(os.path.dirname(config_file), exist_ok=True)
        with open(config_file, mode='w', encoding='utf8') as file_id:
            file_id.writelines(line_list)

    except Exception as e:
        # keep the cause in the error list instead of discarding it
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_cd_hit_est_config_file()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_cd_hit_est_config_file(strict):
    '''
    Validate the CD-HIT-EST config file of a run.

    The sections "identification" and "CD-HIT-EST parameters" are checked key
    by key and every problem found is appended to the error list, so one call
    reports all the detected errors. The parameter "strict" is not used by
    these checks.

    Return the tuple (OK, error_list): OK is True only when no error was
    found.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception as e:
        # keep the cause in the error list instead of discarding it
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        def get_value(section, key):
            # value of the key in the section, or the not-found mark when the section or the key is missing
            return cd_hit_est_option_dict.get(section, {}).get(key, not_found)

        # get the sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = get_value('identification', 'experiment_id')
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = get_value('identification', 'assembly_software')
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [
                    xlib.get_soapdenovotrans_code(),
                    xlib.get_transabyss_code(),
                    xlib.get_trinity_code(),
                    xlib.get_star_code(),
                    xlib.get_cd_hit_est_code(),
                    xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(
                    xlib.get_soapdenovotrans_code(),
                    xlib.get_transabyss_code(),
                    xlib.get_trinity_code(),
                    xlib.get_star_code(),
                    xlib.get_cd_hit_est_code(),
                    xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = get_value('identification', 'assembly_dataset_id')
            is_soapdenovotrans_dataset = assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code())
            is_other_software_dataset = assembly_dataset_id.startswith((
                xlib.get_transabyss_code(),
                xlib.get_trinity_code(),
                xlib.get_star_code(),
                xlib.get_cd_hit_est_code(),
                xlib.get_transcript_filter_code()))
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not is_soapdenovotrans_dataset and not is_other_software_dataset:
                # every slot of this message is a software name (the last one used to be a code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(
                    xlib.get_soapdenovotrans_name(),
                    xlib.get_transabyss_name(),
                    xlib.get_trinity_name(),
                    xlib.get_star_name(),
                    xlib.get_cd_hit_est_name(),
                    xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = get_value('identification', 'assembly_type')
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif is_soapdenovotrans_dataset:
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif is_other_software_dataset:
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(
                        xlib.get_transabyss_name(),
                        xlib.get_trinity_name(),
                        xlib.get_star_name(),
                        xlib.get_cd_hit_est_name(),
                        xlib.get_transcript_filter_name()))
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = get_value('CD-HIT-EST parameters', 'threads')
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    is_threads_valid = int(threads) >= 0
                except (ValueError, TypeError):
                    is_threads_valid = False
                if not is_threads_valid:
                    error_list.append('*** ERROR: the key "threads" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = get_value('CD-HIT-EST parameters', 'memory_limit')
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    is_memory_limit_valid = int(memory_limit) >= 0
                except (ValueError, TypeError):
                    is_memory_limit_valid = False
                if not is_memory_limit_valid:
                    error_list.append('*** ERROR: the key "memory_limit" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = get_value('CD-HIT-EST parameters', 'seq_identity_threshold')
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    threshold = float(seq_identity_threshold)
                    is_threshold_valid = not (threshold < 0.0 or threshold > 1.0)
                except (ValueError, TypeError):
                    is_threshold_valid = False
                if not is_threshold_valid:
                    error_list.append('*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = get_value('CD-HIT-EST parameters', 'word_length')
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    is_word_length_valid = int(word_length) >= 1
                except (ValueError, TypeError):
                    is_word_length_valid = False
                if not is_word_length_valid:
                    error_list.append('*** ERROR: the key "word_length" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            # (the not-found mark is already upper case, so the comparison still works after upper())
            mask = get_value('CD-HIT-EST parameters', 'mask').upper()
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = get_value('CD-HIT-EST parameters', 'match')
            if match == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    int(match)
                except (ValueError, TypeError):
                    error_list.append('*** ERROR: the key "match" in the section "CD-HIT-EST parameters" must be an integer value.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = get_value('CD-HIT-EST parameters', 'mismatch')
            if mismatch == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    int(mismatch)
                except (ValueError, TypeError):
                    # the message names the key being checked (it used to say "match")
                    error_list.append('*** ERROR: the key "mismatch" in the section "CD-HIT-EST parameters" must be an integer value.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = get_value('CD-HIT-EST parameters', 'other_parameters')
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for parameter in parameter_list:

                    # a parameter has the format "--name=value" or "--name"
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                    else:
                        mo = re.search(r'^--(.+)$', parameter)

                    # the first malformed parameter stops the check of the list
                    if mo is None:
                        error_list.append('*** ERROR: the value of the key "other_parameters" in the section "CD-HIT-EST parameters" must be NONE or a valid parameter list.')
                        OK = False
                        break

                    # parameters that have their own key can not be repeated in "other_parameters"
                    parameter_name = mo.group(1).strip()
                    if parameter_name in not_allowed_parameters_list:
                        error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "CD-HIT-EST parameters" because it is controled by {1}.'.format(parameter_name, xlib.get_project_name()))
                        OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_cd_hit_est_name()))

    # return the control variable and the error list
    return (OK, error_list)
def build_quast_process_script(cluster_name, current_run_dir):
    '''
    Build the current QUAST process script.

    The options are read from the QUAST config file and a Bash script is
    written to the path given by get_quast_process_script(). The script runs
    quast.py on the transcriptome file of the assembly dataset and mails the
    outcome to the contact address.

    Return the tuple (OK, error_list): OK is False when the transcriptome
    file can not be determined from the options or when the script can not
    be written; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the QUAST option dictionary
    quast_option_dict = xlib.get_option_dict(get_quast_config_file())

    # get the options
    identification = quast_option_dict['identification']
    experiment_id = identification['experiment_id']
    reference_dataset_id = identification['reference_dataset_id']
    reference_file = identification['reference_file']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    threads = quast_option_dict['QUAST parameters']['threads']

    # set the reference file path (only used when there is a reference dataset)
    has_reference = reference_dataset_id.upper() != 'NONE'
    if has_reference:
        reference_file_path = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    def dataset_dir():
        # cluster directory holding the assembly dataset
        return xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)

    # set the transcriptome file path
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(dataset_dir(), experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(dataset_dir(), experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(dataset_dir())
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(dataset_dir())
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(dataset_dir())
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(dataset_dir())
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(dataset_dir())

    # an unsupported software/type combination is reported here, before the script is touched; otherwise
    # it would only surface as an unbound variable while writing, leaving a truncated script behind
    if transcriptome_file is None:
        error_list.append('*** ERROR: the transcriptome file can not be determined for the assembly software {0} and the assembly type {1}.'.format(assembly_software, assembly_type))
        OK = False
        return (OK, error_list)

    # get the QUAST process script name
    quast_process_script = get_quast_process_script()

    # write the QUAST process script
    try:

        separator = '#-------------------------------------------------------------------------------'

        # header and function "init"
        script_line_list = [
            '#!/bin/bash',
            separator,
            'QUAST_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_quast_bioconda_code()),
            'PATH=$QUAST_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_quast_bioconda_code()),
            separator,
            'function init',
            '{',
            ' INIT_DATETIME=`date --utc +%s`',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' echo "$SEP"',
            ' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator,
        ]

        # function "run_quast_process": the reference and scaffolds options are added only when they apply
        script_line_list += [
            'function run_quast_process',
            '{',
            ' cd {0}'.format(current_run_dir),
            ' echo "$SEP"',
            ' quast.py --version',
            ' echo "$SEP"',
            ' /usr/bin/time \\',
            ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
            ' quast.py \\',
            ' --threads {0} \\'.format(threads),
            ' --output-dir {0} \\'.format(current_run_dir),
        ]
        if has_reference:
            script_line_list.append(' -R {0} \\'.format(reference_file_path))
        if assembly_type.upper() == 'SCAFFOLDS':
            script_line_list.append(' --scaffolds \\')
        script_line_list += [
            ' {0}'.format(transcriptome_file),
            ' RC=$?',
            ' if [ $RC -ne 0 ]; then manage_error quast.py $RC; fi',
            '}',
            separator,
        ]

        # function "end"
        script_line_list += [
            'function end',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
            ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_quast_name()),
            ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_quast_name(), cluster_name),
            ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
            ' exit 0',
            '}',
            separator,
        ]

        # function "manage_error"
        script_line_list += [
            'function manage_error',
            '{',
            ' END_DATETIME=`date --utc +%s`',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            ' calculate_duration',
            ' echo "$SEP"',
            ' echo "ERROR: $1 returned error $2"',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            ' echo "$SEP"',
            ' RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
            ' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_quast_name()),
            ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_quast_name(), cluster_name),
            ' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
            ' exit 3',
            '}',
            separator,
        ]

        # function "calculate_duration" and the calls that run the process
        script_line_list += [
            'function calculate_duration',
            '{',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            ' HH=`expr $DURATION / 3600`',
            ' MM=`expr $DURATION % 3600 / 60`',
            ' SS=`expr $DURATION % 60`',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
            'init',
            'run_quast_process',
            'end',
        ]

        # make sure the directory of the script exists and write the script with Unix line endings
        os.makedirs(os.path.dirname(quast_process_script), exist_ok=True)
        with open(quast_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(script_line) for script_line in script_line_list)

    except Exception as e:
        # keep the cause in the error list instead of discarding it
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be created'.format(quast_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def get_result_dataset_dict(cluster_name, experiment_id, status, passed_connection, ssh_client):
    '''
    Get a dictionary with the result datasets of an experiment in the cluster.

    "status" selects the entries listed in the experiment result directory:
    'uncompressed' lists the entries that are not regular files and
    'compressed' lists the entries that are not directories. When
    "passed_connection" is False, an SSH connection to the master node is
    created here and closed before returning; otherwise "ssh_client" is used
    and left open.

    Return the tuple (OK, error_list, result_dataset_dict). The dictionary is
    keyed by the result dataset identification and every value holds the keys
    'result_dataset_id' and 'result_dataset_name'.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the result directory in the cluster
    cluster_result_dir = xlib.get_cluster_result_dir()

    # initialize the dictionary of the result datasets
    result_dataset_dict = {}

    # create the SSH client connection; remember it, so it is closed even if a later step fails
    connection_created = False
    if not passed_connection:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        connection_created = OK

    # verify the result directory is created (an empty output is handled as "not created")
    if OK:
        command = '[ -d {0} ] && echo RC=0 || echo RC=1'.format(cluster_result_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if not stdout or stdout[-1] != 'RC=0':
            error_list.append('*** ERROR: There is not any volume mounted in the result directory.\n')
            error_list.append('You must link a volume in the mounting point {0} for the template {1}.\n'.format(cluster_result_dir, cluster_name))
            OK = False

    # set the listing command and the patterns corresponding to the status
    if OK:
        if status == 'uncompressed':
            command = 'cd {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(cluster_result_dir, experiment_id)
            input_pattern = '{0}-(.+)-(.+)'
            output_pattern = '{0} ({1} {2})'
        elif status == 'compressed':
            command = 'cd {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^d > /dev/null && echo $list; done;'.format(cluster_result_dir, experiment_id)
            input_pattern = '{0}-(.+)-(.+).tar.gz'
            output_pattern = '{0} ({1} {2}) [compressed]'
        else:
            error_list.append('*** ERROR: the status {0} is not valid; it must be uncompressed or compressed.\n'.format(status))
            OK = False

    # get the list of the result datasets
    if OK:
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)

    # get the dictionary of the result datasets
    if OK:

        # (code, name) of the known software; the order is the one used to test the dataset prefixes
        software_list = [
            (xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name()),
            (xlib.get_fastqc_code(), xlib.get_fastqc_name()),
            (xlib.get_gzip_code(), xlib.get_gzip_name()),
            (xlib.get_insilico_read_normalization_code(), xlib.get_insilico_read_normalization_name()),
            (xlib.get_quast_code(), xlib.get_quast_name()),
            (xlib.get_ref_eval_code(), xlib.get_ref_eval_name()),
            (xlib.get_rnaquast_code(), xlib.get_rnaquast_name()),
            (xlib.get_rsem_eval_code(), xlib.get_rsem_eval_name()),
            (xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name()),
            (xlib.get_star_code(), xlib.get_star_name()),
            (xlib.get_transabyss_code(), xlib.get_transabyss_name()),
            (xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name()),
            (xlib.get_transcriptome_blastx_code(), xlib.get_transcriptome_blastx_name()),
            (xlib.get_transrate_code(), xlib.get_transrate_name()),
            (xlib.get_trimmomatic_code(), xlib.get_trimmomatic_name()),
            (xlib.get_trinity_code(), xlib.get_trinity_name()),
        ]

        for line in stdout:
            result_dataset_id = line.rstrip('\n')

            # "lost+found" is not a result dataset
            if result_dataset_id == 'lost+found':
                continue

            # the name defaults to the identification; it is replaced by "software (date time)"
            # when the identification has the format of a known software
            result_dataset_name = result_dataset_id
            for (software_code, software_name) in software_list:
                if result_dataset_id.startswith(software_code + '-'):
                    mo = re.match(input_pattern.format(software_code), result_dataset_id)
                    if mo is not None:
                        result_dataset_name = output_pattern.format(software_name, mo.group(1), mo.group(2))
                    break

            result_dataset_dict[result_dataset_id] = {'result_dataset_id': result_dataset_id, 'result_dataset_name': result_dataset_name}

    # close the SSH client connection when it was created by this function
    if connection_created:
        xssh.close_ssh_client_connection(ssh_client)

    # return the control variable, error list and dictionary of the result datasets
    return (OK, error_list, result_dataset_dict)
def check_express_config_file(strict):
    '''
    Check the eXpress config file of a run.

    The file whose path is returned by get_express_config_file() is loaded
    with xlib.get_option_dict() and the sections "identification",
    "alignment-dataset-n" and "eXpress parameters" are checked key by key.

    Parameters:
        strict: it is not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error has been found;
        error_list has the messages of all the errors detected.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        express_option_dict = xlib.get_option_dict(get_express_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(express_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = express_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            else:
                # the valid values depend on whether the assembly dataset is a SOAPdenovo-Trans one
                if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                    valid_assembly_type_list = ['CONTIGS', 'SCAFFOLDS']
                else:
                    valid_assembly_type_list = ['NONE']
                if assembly_type.upper() not in valid_assembly_type_list:
                    error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                    OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            # the other two sections have their own checks
            if section in ['identification', 'eXpress parameters']:
                continue

            # check than the section identification is like alignment-dataset-n
            if not re.match('^alignment-dataset-[0-9]+$', section):
                error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                OK = False
                continue

            section_dict = express_option_dict.get(section, {})

            # check section "alignment-dataset-n" - key "alignment_software"
            alignment_software = section_dict.get('alignment_software', not_found)
            if alignment_software == not_found:
                error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                OK = False

            # check section "alignment-dataset-n" - key "alignment_dataset_id"
            alignment_dataset_id = section_dict.get('alignment_dataset_id', not_found)
            if alignment_dataset_id == not_found:
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                OK = False

        # check section "eXpress parameters"
        if 'eXpress parameters' not in sections_list:
            error_list.append('*** ERROR: the section "eXpress parameters" is not found.')
            OK = False
        else:

            parameters_dict = express_option_dict.get('eXpress parameters', {})

            # check section "express parameters" - key "frag-len-mean"
            frag_len_mean = parameters_dict.get('frag-len-mean', not_found)
            if frag_len_mean == not_found:
                error_list.append('*** ERROR: the key "frag-len-mean" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_mean, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-mean" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "express parameters" - key "frag-len-stddev"
            frag_len_stddev = parameters_dict.get('frag-len-stddev', not_found)
            if frag_len_stddev == not_found:
                error_list.append('*** ERROR: the key "frag-len-stddev" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_stddev, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-stddev" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "library_type"
            library_type = parameters_dict.get('library_type', not_found)
            if library_type == not_found:
                error_list.append('*** ERROR: the key "library_type" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(library_type, get_library_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "library_type" has to be {get_library_type_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "max-indel-size"
            max_indel_size = parameters_dict.get('max-indel-size', not_found)
            if max_indel_size == not_found:
                error_list.append('*** ERROR: the key "max-indel-size" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(max_indel_size, minimum=0):
                error_list.append('*** ERROR: the key "max-indel-size" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "eXpress parameters" - key "no-bias-correct"
            no_bias_correct = parameters_dict.get('no-bias-correct', not_found)
            if no_bias_correct == not_found:
                error_list.append('*** ERROR: the key "no-bias-correct" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_bias_correct, get_no_bias_correct_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-bias-correct" has to be {get_no_bias_correct_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "no-error-model"
            no_error_model = parameters_dict.get('no-error-model', not_found)
            if no_error_model == not_found:
                error_list.append('*** ERROR: the key "no-error-model" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_error_model, get_no_error_model_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-error-model" has to be {get_no_error_model_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "other_parameters"
            not_allowed_parameters_list = ['no-update-check', 'frag-len-mean', 'frag-len-stddev', 'max-indel-size', 'fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded', 'no-bias-correct', 'no-error-model', 'output-dir']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "eXpress parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept in its own variable: assigning it to OK
                # would hide the errors found in the previous checks
                (is_parameter_list_OK, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not is_parameter_list_OK:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_express_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_busco_config_file(strict):
    '''
    Check the BUSCO config file of a run.

    The file whose path is returned by get_busco_config_file() is loaded
    with xlib.get_option_dict() and the sections "identification" and
    "BUSCO parameters" are checked key by key. The check of the key
    "lineage_data_url" opens the URL, so it needs network access.

    Parameters:
        strict: it is not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error has been found;
        error_list has the messages of all the errors detected.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = busco_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            else:
                # the valid values depend on whether the assembly dataset is a SOAPdenovo-Trans one
                if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                    valid_assembly_type_list = ['CONTIGS', 'SCAFFOLDS']
                else:
                    valid_assembly_type_list = ['NONE']
                if assembly_type.upper() not in valid_assembly_type_list:
                    error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            parameters_dict = busco_option_dict.get('BUSCO parameters', {})

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = parameters_dict.get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(ncpu, minimum=1):
                error_list.append('*** ERROR: the key "ncpu" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "lineage_data_url"
            lineage_data_url = parameters_dict.get('lineage_data_url', not_found)
            if lineage_data_url == not_found:
                error_list.append('*** ERROR: the key "lineage_data_url" is not found in the section "BUSCO parameters"')
                OK = False
            else:
                try:
                    # the response is only opened to verify that the address
                    # can be reached; the with statement closes it at once
                    with urllib.request.urlopen(lineage_data_url):
                        pass
                except Exception as e:
                    error_list.append(f'*** EXCEPTION: "{e}".')
                    error_list.append('*** ERROR: the key "lineage_data_url" has to be a reachable address.')
                    OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = parameters_dict.get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = parameters_dict.get('evalue', not_found)
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_float(evalue, minimum=0., mne=1E-12):
                error_list.append('*** ERROR: the key "evalue" has to be a float number greater than 0.')
                OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = parameters_dict.get('limit', not_found)
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(limit, minimum=1):
                error_list.append('*** ERROR: the key "limit" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "species"
            species = parameters_dict.get('species', not_found)
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = parameters_dict.get('long', not_found)
            if long == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(long, get_long_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "long" has to be {get_long_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = parameters_dict.get('augustus_options', not_found)
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                OK = False
            elif augustus_options.upper() != 'NONE':
                # the result is kept in its own variable: assigning it to OK
                # would hide the errors found in the previous checks
                (is_parameter_list_OK, error_list2) = xlib.check_parameter_list(augustus_options, "augustus_options", [])
                error_list = error_list + error_list2
                if not is_parameter_list_OK:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_busco_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def create_express_config_file(experiment_id='exp001', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS', alignment_dataset_id_list=['bowtie2-170101-235959']):
    '''
    Create eXpress config file with the default options. It is necessary
    update the options in each run.

    Parameters:
        experiment_id: value written in the key "experiment_id".
        assembly_dataset_id: value written in the key "assembly_dataset_id";
            its prefix sets the value of the key "assembly_software".
        assembly_type: value written in the key "assembly_type".
        alignment_dataset_id_list: alignment dataset identifications; one
            section "alignment-dataset-n" is written per item and the prefix
            of each item sets its key "alignment_software". The default list
            is only read, never modified.

    Returns:
        A tuple (OK, error_list): OK is False when the file could not be
        written or the software of a dataset could not be determined;
        error_list has the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the assembly software; None means that the dataset identification
    # does not start with any of the codes checked here
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_ggtrinity_code()):
        assembly_software = xlib.get_ggtrinity_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()
    else:
        assembly_software = None

    # create the eXpress config file and write the default options
    try:

        # the datasets are verified before the file is opened: opening it in
        # mode "w" truncates an existing config file
        if assembly_software is None:
            raise ValueError(f'the assembly software of the dataset {assembly_dataset_id} can not be determined')

        # set the alignment software of every alignment dataset
        alignment_dataset_list = []
        for alignment_dataset_id in alignment_dataset_id_list:
            if alignment_dataset_id.startswith(xlib.get_bowtie2_code()):
                alignment_software = xlib.get_bowtie2_code()
            elif alignment_dataset_id.startswith(xlib.get_gsnap_code()):
                alignment_software = xlib.get_gsnap_code()
            else:
                # without this branch the software of the previous dataset
                # would be written for this one
                raise ValueError(f'the alignment software of the dataset {alignment_dataset_id} can not be determined')
            alignment_dataset_list.append((alignment_software, alignment_dataset_id))

        if not os.path.exists(os.path.dirname(get_express_config_file())):
            os.makedirs(os.path.dirname(get_express_config_file()))
        with open(get_express_config_file(), mode='w', encoding='iso-8859-1', newline='\n') as file_id:

            # write the header comments
            file_id.write( '# You must review the information of this file and update the values with the corresponding ones to the current run.\n')
            file_id.write( '#\n')
            file_id.write(f'# The assembly files have to be located in the cluster directory {xlib.get_cluster_result_dir()}/experiment_id/assembly_dataset_id\n')
            file_id.write(f'# The alignment file has to be located in the cluster directory {xlib.get_cluster_result_dir()}/experiment_id/alignment_dataset_id\n')
            file_id.write( '# The experiment_id, assembly_dataset_id and alignment_dataset_id are fixed in the identification section.\n')
            file_id.write( '#\n')
            file_id.write( '# You can consult the parameters of eXpress and their meaning in "https://pachterlab.github.io/eXpress/".\n')
            file_id.write( '#\n')
            file_id.write( '# In section "eXpress parameters", the key "other_parameters" allows you to input additional parameters in the format:\n')
            file_id.write( '#\n')
            file_id.write( '# other_parameters = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]\n')
            file_id.write( '#\n')
            file_id.write( '# parameter-i is a parameter name of eXpress and value-i a valid value of parameter-i, e.g.\n')
            file_id.write( '#\n')
            file_id.write( '# other_parameters = --calc-covar; --forget-param=0.6\n')
            file_id.write( '\n')

            # write the identification section
            file_id.write( '# This section has the information identifies the experiment.\n')
            file_id.write( '[identification]\n')
            file_id.write( '{0:<50} {1}\n'.format(f'experiment_id = {experiment_id}', '# experiment identification'))
            file_id.write( '{0:<50} {1}\n'.format(f'assembly_software = {assembly_software}', f'# assembly software: {get_assembly_software_code_list_text()}'))
            file_id.write( '{0:<50} {1}\n'.format(f'assembly_dataset_id = {assembly_dataset_id}', '# assembly dataset identification'))
            file_id.write( '{0:<50} {1}\n'.format(f'assembly_type = {assembly_type}', f'# assembly type: CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()}; NONE in any other case'))

            # write the alignment dataset sections
            for i, (alignment_software, alignment_dataset_id) in enumerate(alignment_dataset_list):
                file_id.write( '\n')
                if i == 0:
                    file_id.write( '# This section has the information of the first alignment dataset.\n')
                file_id.write(f'[alignment-dataset-{i + 1}]\n')
                file_id.write( '{0:<50} {1}\n'.format(f'alignment_software = {alignment_software}', f'# alignment software: {get_alignment_software_code_list_text()}'))
                file_id.write( '{0:<50} {1}\n'.format(f'alignment_dataset_id = {alignment_dataset_id}', '# alignment dataset identification'))
                if i == 0:
                    file_id.write( '\n')
                    file_id.write( '# If there are more alignment datasets, you have to repeat the section alignment-dataset-1 with the data of each dataset.\n')
                    file_id.write( '# The section identification has to be alignment-dataset-n (n is an integer not repeated)\n')

            # write the eXpress parameters section
            file_id.write( '\n')
            file_id.write( '# This section has the information to set the eXpress parameters\n')
            file_id.write( '[eXpress parameters]\n')
            file_id.write( '{0:<50} {1}\n'.format( 'frag-len-mean = 200', '# mean fragment length'))
            file_id.write( '{0:<50} {1}\n'.format( 'frag-len-stddev = 60', '# fragment length standard deviation'))
            file_id.write( '{0:<50} {1}\n'.format( 'library_type = NONE', f'# library type: {get_library_type_code_list_text()}'))
            file_id.write( '{0:<50} {1}\n'.format( 'max-indel-size = 0', '# maximum allowed size of a single indel'))
            file_id.write( '{0:<50} {1}\n'.format( 'no-bias-correct = NO', f'# if YES, eXpress will not measure and account for sequence-specific biases: {get_no_bias_correct_code_list_text()}'))
            file_id.write( '{0:<50} {1}\n'.format( 'no-error-model = NO', f'# if YES, eXpress will not measure and account for errors in alignments: {get_no_error_model_code_list_text()}'))
            file_id.write( '{0:<50} {1}\n'.format( 'other_parameters = NONE', '# additional parameters to the previous ones or NONE'))

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_express_config_file()} can not be recreated')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_gmap_config_file(strict):
    '''
    Validate the GMAP config file of a run.

    The file whose path is returned by get_gmap_config_file() is loaded with
    xlib.get_option_dict() and the sections "identification" and
    "GMAP parameters" are checked key by key.

    Parameters:
        strict: it is not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error has been found;
        error_list has the messages of all the errors detected.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(gmap_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = gmap_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = identification_dict.get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = identification_dict.get('reference_file', not_found)
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith((xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())):
                # the message lists the software names, so the last item is
                # the name of the transcript filter (not its code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith((xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "GMAP parameters"
        if 'GMAP parameters' not in sections_list:
            error_list.append('*** ERROR: the section "GMAP parameters" is not found.')
            OK = False
        else:

            parameters_dict = gmap_option_dict.get('GMAP parameters', {})

            # check section "GMAP parameters" - key "threads"
            threads = parameters_dict.get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "GMAP parameters".')
                OK = False
            else:
                # a value that is not an integer is reported like a value out of range
                try:
                    is_threads_valid = int(threads) >= 1
                except ValueError:
                    is_threads_valid = False
                if not is_threads_valid:
                    error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "kmer"
            kmer = parameters_dict.get('kmer', not_found)
            if kmer == not_found:
                error_list.append('*** ERROR: the key "kmer" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    is_kmer_valid = kmer.upper() == 'NONE' or 1 <= int(kmer) <= 16
                except ValueError:
                    is_kmer_valid = False
                if not is_kmer_valid:
                    error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                    OK = False

            # check section "GMAP parameters" - key "sampling"
            sampling = parameters_dict.get('sampling', not_found)
            if sampling == not_found:
                error_list.append('*** ERROR: the key "sampling" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    is_sampling_valid = sampling.upper() == 'NONE' or int(sampling) >= 1
                except ValueError:
                    is_sampling_valid = False
                if not is_sampling_valid:
                    error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                    OK = False

            # check section "GMAP parameters" - key "input-buffer-size"
            input_buffer_size = parameters_dict.get('input-buffer-size', not_found)
            if input_buffer_size == not_found:
                error_list.append('*** ERROR: the key "input-buffer-size" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    is_input_buffer_size_valid = int(input_buffer_size) >= 1
                except ValueError:
                    is_input_buffer_size_valid = False
                if not is_input_buffer_size_valid:
                    error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "output-buffer-size"
            output_buffer_size = parameters_dict.get('output-buffer-size', not_found)
            if output_buffer_size == not_found:
                error_list.append('*** ERROR: the key "output-buffer-size" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    is_output_buffer_size_valid = int(output_buffer_size) >= 1
                except ValueError:
                    is_output_buffer_size_valid = False
                if not is_output_buffer_size_valid:
                    error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "prunelevel"
            prunelevel = parameters_dict.get('prunelevel', not_found)
            if prunelevel == not_found:
                error_list.append('*** ERROR: the key "prunelevel" is not found in the section "GMAP parameters".')
                OK = False
            elif prunelevel not in ['0', '1', '2', '3']:
                error_list.append('*** ERROR: the key "prunelevel" in the section "GMAP parameters" must be 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive).')
                OK = False

            # check section "GMAP parameters" - key "format"
            output_format = parameters_dict.get('format', not_found)
            if output_format == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "GMAP parameters".')
                OK = False
            elif output_format.upper() not in ['COMPRESS', 'SUMMARY', 'ALIGN', 'PLS', 'GFF3_GENE', 'SPLICESITES', 'INTRONS', 'MAP_EXONS', 'MAP_RANGES', 'COORDS']:
                error_list.append('*** ERROR: the key "format" in the section "GMAP parameters" must be COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS.')
                OK = False

            # check section "GMAP parameters" - key "other_parameters"
            not_allowed_parameters_list = ['nthreads', 'kmer', 'sampling', 'input-buffer-size', 'output-buffer-size', 'prunelevel', 'compress', 'summary', 'align', 'format']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "GMAP parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for parameter in parameter_list:

                    # a parameter has the format "--name=value" or "--name"
                    if parameter.find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                    else:
                        pattern = r'^--(.+)$'
                    mo = re.search(pattern, parameter)

                    # the check finishes with the first parameter with a wrong format
                    if mo is None:
                        error_list.append('*** ERROR: the value of the key "other_parameters" in the section "GMAP parameters" must be NONE or a valid parameter list.')
                        OK = False
                        break

                    # the parameters with their own key can not be repeated here
                    parameter_name = mo.group(1).strip()
                    if parameter_name in not_allowed_parameters_list:
                        error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "GMAP parameters" because it is controled by NGScloud.'.format(parameter_name))
                        OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gmap_name()))

    # return the control variable and the error list
    return (OK, error_list)
def build_express_process_script(cluster_name, current_run_dir):
    '''
    Build the current eXpress process script.

    The options are read from the eXpress config file and a Bash script is
    written to the path returned by get_express_process_script(). The script
    runs "express" once per sorted BAM file found in every alignment dataset
    declared in the sections named "alignment-dataset-<n>" of the config file.

    Parameters:
        cluster_name: cluster name echoed by the script and passed to the
            mail message builders.
        current_run_dir: run directory written as CURRENT_DIR and used to
            derive the status directory and status file paths.

    Returns:
        A tuple (OK, error_list). OK is False and error_list holds the error
        messages when the transcriptome file can not be determined or when
        the script file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the eXpress option dictionary
    express_option_dict = xlib.get_option_dict(get_express_config_file())

    # get the options
    identification_dict = express_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    parameters_dict = express_option_dict['eXpress parameters']
    frag_len_mean = parameters_dict['frag-len-mean']
    frag_len_stddev = parameters_dict['frag-len-stddev']
    library_type = parameters_dict['library_type']
    max_indel_size = parameters_dict['max-indel-size']
    no_bias_correct = parameters_dict['no-bias-correct']
    no_error_model = parameters_dict['no-error-model']
    other_parameters = parameters_dict['other_parameters']

    # build the alignment dataset identification list from the sections whose
    # identification is like alignment-dataset-n (sections are visited in
    # lexicographic order)
    alignment_dataset_id_list = [
        express_option_dict[section]['alignment_dataset_id']
        for section in sorted(express_option_dict.keys())
        if re.match(r'^alignment-dataset-[0-9]+$', section)
    ]

    # set the transcriptome file path; it stays None when the assembly
    # software (or the assembly type of SOAPdenovo-Trans) is not recognized
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        # the assembly type is compared case-insensitively
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dataset_dir}/filtered-transcriptome.fasta'

    # stop before touching the script file if the transcriptome file is unknown;
    # otherwise the failure would surface as a misleading "can not be created"
    # error after the script has been partially written
    if transcriptome_file is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be determined from the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        OK = False
        return (OK, error_list)

    # set the strand option (only these library types add an option)
    stranded_option = None
    if library_type.lower() in ('fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded'):
        stranded_option = library_type.lower()

    # split the additional parameters once (they are the same for every dataset)
    other_parameter_list = []
    if other_parameters.upper() != 'NONE':
        other_parameter_list = [x.strip() for x in other_parameters.split(';')]

    # get the eXpress process script path
    express_process_script = get_express_process_script()

    # set the line used to separate the script sections
    separator_line = '#-------------------------------------------------------------------------------\n'

    # write the eXpress process script
    try:
        if not os.path.exists(os.path.dirname(express_process_script)):
            os.makedirs(os.path.dirname(express_process_script))
        with open(express_process_script, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            write = script_file_id.write

            # environment
            write('#!/bin/bash\n')
            write(separator_line)
            write('SEP="#########################################"\n')
            write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            write(separator_line)
            write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            write('export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            write(separator_line)
            write(f'CURRENT_DIR={current_run_dir}\n')
            write(separator_line)
            write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            write('mkdir --parents $STATUS_DIR\n')
            write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            write(separator_line)

            # function init
            write('function init\n')
            write('{\n')
            write(' INIT_DATETIME=`date --utc +%s`\n')
            write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            write(' echo "$SEP"\n')
            write(' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            write(' echo "$SEP"\n')
            write(f' echo "CLUSTER: {cluster_name}"\n')
            write(' echo "HOST NAME: $HOSTNAME"\n')
            write(' echo "HOST IP: $HOST_IP"\n')
            write(' echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            write('}\n')
            write(separator_line)

            # function run_express_process: one "while read" loop per alignment dataset
            write('function run_express_process\n')
            write('{\n')
            write(f' source activate {xlib.get_express_anaconda_code()}\n')
            write(' cd $CURRENT_DIR\n')
            for alignment_dataset_id in alignment_dataset_id_list:
                alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id)}/*.sorted.bam'
                write(f' SORTED_BAM_LIST={alignment_dataset_id}-sorted-bam-files.txt\n')
                write(f' ls {alignment_files} > $SORTED_BAM_LIST\n')
                write(' while read FILE_BAM; do\n')
                write(' NAME=`basename $FILE_BAM`\n')
                # the last 11 characters of the file name are removed in the script
                write(' NAME=${NAME:0:-11}\n')
                write(f' SUBDIR={alignment_dataset_id}-$NAME\n')
                write(' mkdir --parents $CURRENT_DIR/$SUBDIR\n')
                write(' echo "$SEP"\n')
                write(f' echo "Quantitating alignment dataset {alignment_dataset_id} - library $SUBDIR ..."\n')
                write(' /usr/bin/time \\\n')
                write(f' --format="{xlib.get_time_output_format(separator=False)}" \\\n')
                write(' express \\\n')
                write(' --no-update-check \\\n')
                write(f' --frag-len-mean {frag_len_mean} \\\n')
                write(f' --frag-len-stddev {frag_len_stddev} \\\n')
                if stranded_option is not None:
                    write(f' --{stranded_option} \\\n')
                write(f' --max-indel-size {max_indel_size} \\\n')
                if no_bias_correct.upper() == 'YES':
                    write(' --no-bias-correct \\\n')
                if no_error_model.upper() == 'YES':
                    write(' --no-error-model \\\n')
                for parameter in other_parameter_list:
                    # a parameter that does not match the pattern makes "mo" None;
                    # the resulting AttributeError is reported by the handler below
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        write(f' --{parameter_name}={parameter_value} \\\n')
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        write(f' --{parameter_name} \\\n')
                write(' --output-dir $CURRENT_DIR/$SUBDIR \\\n')
                write(f' {transcriptome_file} \\\n')
                write(' $FILE_BAM\n')
                write(' RC=$?\n')
                write(' if [ $RC -ne 0 ]; then manage_error express $RC; fi\n')
                write(' echo "Quantitation is done."\n')
                write(' done < $SORTED_BAM_LIST\n')
            write(' conda deactivate\n')
            write('}\n')
            write(separator_line)

            # function end
            write('function end\n')
            write('{\n')
            write(' END_DATETIME=`date --utc +%s`\n')
            write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            write(' calculate_duration\n')
            write(' echo "$SEP"\n')
            write(' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            write(' echo "$SEP"\n')
            write(' send_mail ok\n')
            write(' touch $SCRIPT_STATUS_OK\n')
            write(' exit 0\n')
            write('}\n')
            write(separator_line)

            # function manage_error
            write('function manage_error\n')
            write('{\n')
            write(' END_DATETIME=`date --utc +%s`\n')
            write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            write(' calculate_duration\n')
            write(' echo "$SEP"\n')
            write(' echo "ERROR: $1 returned error $2"\n')
            write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            write(' echo "$SEP"\n')
            write(' send_mail wrong\n')
            write(' touch $SCRIPT_STATUS_WRONG\n')
            write(' exit 3\n')
            write('}\n')
            write(separator_line)

            # function send_mail
            process_name = f'{xlib.get_express_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            write('function send_mail\n')
            write('{\n')
            write(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            write(' if [ "$1" == "ok" ]; then\n')
            write(f' MESSAGE="{mail_message_ok}"\n')
            write(' elif [ "$1" == "wrong" ]; then\n')
            write(f' MESSAGE="{mail_message_wrong}"\n')
            write(' else\n')
            write(' MESSAGE=""\n')
            write(' fi\n')
            write(' DESTINATION_FILE=mail-destination.json\n')
            write(' echo "{" > $DESTINATION_FILE\n')
            write(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            write(' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            write(' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            write(' echo "}" >> $DESTINATION_FILE\n')
            write(' MESSAGE_FILE=mail-message.json\n')
            write(' echo "{" > $MESSAGE_FILE\n')
            write(' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            write(' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            write(' echo " }," >> $MESSAGE_FILE\n')
            write(' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            write(' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            write(' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            write(' echo " }" >> $MESSAGE_FILE\n')
            write(' echo " }" >> $MESSAGE_FILE\n')
            write(' echo "}" >> $MESSAGE_FILE\n')
            write(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            write('}\n')
            write(separator_line)

            # function calculate_duration
            write('function calculate_duration\n')
            write('{\n')
            write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            write(' HH=`expr $DURATION / 3600`\n')
            write(' MM=`expr $DURATION % 3600 / 60`\n')
            write(' SS=`expr $DURATION % 60`\n')
            write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            write('}\n')
            write(separator_line)

            # main flow
            write('init\n')
            write('run_express_process\n')
            write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {express_process_script} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_gmap_process_script(cluster_name, current_run_dir):
    '''
    Build the current GMAP process script.

    The options are read from the GMAP config file and a Bash script is
    written to the path returned by get_gmap_process_script(). The script
    builds a GMAP database from the reference file with "gmap_build" and then
    runs "gmap" on the transcriptome file, redirecting its output to a file
    named after the output format.

    Parameters:
        cluster_name: cluster name included in the messages of the script.
        current_run_dir: run directory where the script changes to before
            building the database and running GMAP.

    Returns:
        A tuple (OK, error_list). OK is False and error_list holds the error
        messages when the transcriptome file can not be determined or when
        the script file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the options
    identification_dict = gmap_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    reference_dataset_id = identification_dict['reference_dataset_id']
    reference_file = identification_dict['reference_file']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    parameters_dict = gmap_option_dict['GMAP parameters']
    threads = parameters_dict['threads']
    kmer = parameters_dict['kmer']
    sampling = parameters_dict['sampling']
    input_buffer_size = parameters_dict['input-buffer-size']
    output_buffer_size = parameters_dict['output-buffer-size']
    prunelevel = parameters_dict['prunelevel']
    # named output_format so that the builtin "format" is not shadowed
    output_format = parameters_dict['format']
    other_parameters = parameters_dict['other_parameters']

    # set the cluster reference dataset directory
    cluster_reference_dataset_dir = xlib.get_cluster_reference_dataset_dir(reference_dataset_id)

    # set the cluster reference file
    cluster_reference_file = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the GMAP database name (reference file name without its extension)
    reference_file_name = os.path.splitext(reference_file)[0]
    gmap_database = f'{reference_file_name}-gmap_database'

    # set the transcriptome file path; it stays None when the assembly
    # software (or the assembly type of SOAPdenovo-Trans) is not recognized
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        # NOTE(review): build_express_process_script selects Trinity-GG.fasta with
        # xlib.get_ggtrinity_code(); confirm that the STAR code is the intended key here
        transcriptome_file = f'{assembly_dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dataset_dir}/filtered-transcriptome.fasta'

    # stop before touching the script file if the transcriptome file is unknown;
    # otherwise the failure would surface as a misleading "can not be created"
    # error after the script has been partially written
    if transcriptome_file is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be determined from the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        OK = False
        return (OK, error_list)

    # set the output file path
    output_file = f'gmap_output_{output_format.lower()}.txt'

    # split the additional parameters
    other_parameter_list = []
    if other_parameters.upper() != 'NONE':
        other_parameter_list = [x.strip() for x in other_parameters.split(';')]

    # get the GMAP process script name
    gmap_process_script = get_gmap_process_script()

    # set the lines that are written more than once
    separator_line = '#-------------------------------------------------------------------------------'
    time_format_line = ' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
    # {0}: GMAP name; {1}: cluster name; {2}: OK or WRONG
    mail_message_template = ' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'

    # write the GMAP process script
    try:
        if not os.path.exists(os.path.dirname(gmap_process_script)):
            os.makedirs(os.path.dirname(gmap_process_script))
        with open(gmap_process_script, mode='w', encoding='utf8', newline='\n') as file_id:

            def write_line(text):
                # write a script line followed by a line feed
                file_id.write(f'{text}\n')

            # environment
            write_line('#!/bin/bash')
            write_line(separator_line)
            write_line(f'GMAP_GSNAP_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/envs/{xlib.get_gmap_gsnap_bioconda_code()}/bin')
            write_line('PATH=$GMAP_GSNAP_PATH:$PATH')
            write_line('SEP="#########################################"')
            write_line(f'cd {xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin')
            write_line(f'source activate {xlib.get_gmap_gsnap_bioconda_code()}')
            write_line(separator_line)

            # function init
            write_line('function init')
            write_line('{')
            write_line(' INIT_DATETIME=`date --utc +%s`')
            write_line(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`')
            write_line(' echo "$SEP"')
            write_line(f' echo "Script started in node $HOSTNAME of cluster {cluster_name} at $FORMATTED_INIT_DATETIME UTC."')
            write_line('}')
            write_line(separator_line)

            # function build_gmap_database
            write_line('function build_gmap_database')
            write_line('{')
            write_line(f' cd {current_run_dir}')
            write_line(' echo "$SEP"')
            write_line(' /usr/bin/time \\')
            write_line(time_format_line)
            write_line(' gmap_build \\')
            # these two lines have no blank before the continuation backslash;
            # the leading blank of the following line keeps the arguments apart
            write_line(f' --dir={cluster_reference_dataset_dir}\\')
            write_line(f' --db={gmap_database}\\')
            if kmer.upper() != 'NONE':
                write_line(f' --kmer={kmer} \\')
            write_line(f' {cluster_reference_file}')
            write_line('}')
            write_line(separator_line)

            # function run_gmap_process
            write_line('function run_gmap_process')
            write_line('{')
            write_line(f' cd {current_run_dir}')
            write_line(' echo "$SEP"')
            write_line(' gmap --version')
            write_line(' echo "$SEP"')
            write_line(' /usr/bin/time \\')
            write_line(time_format_line)
            write_line(' gmap \\')
            write_line(f' --nthreads={threads} \\')
            write_line(f' --dir={cluster_reference_dataset_dir} \\')
            write_line(f' --db={gmap_database} \\')
            if kmer.upper() != 'NONE':
                write_line(f' --kmer={kmer} \\')
            if sampling.upper() != 'NONE':
                write_line(f' --sampling={sampling} \\')
            write_line(f' --input-buffer-size={input_buffer_size} \\')
            write_line(f' --output-buffer-size={output_buffer_size} \\')
            write_line(f' --prunelevel={prunelevel} \\')
            # COMPRESS, SUMMARY and ALIGN have their own flag; any other value goes to --format
            if output_format.upper() == 'COMPRESS':
                write_line(' --compress \\')
            elif output_format.upper() == 'SUMMARY':
                write_line(' --summary \\')
            elif output_format.upper() == 'ALIGN':
                write_line(' --align \\')
            else:
                write_line(f' --format={output_format.lower()} \\')
            write_line(' --ordered \\')
            write_line(' --nofails \\')
            for parameter in other_parameter_list:
                # a parameter that does not match the pattern makes "mo" None;
                # the resulting AttributeError is reported by the handler below
                if parameter.find('=') > 0:
                    mo = re.search(r'^--(.+)=(.+)$', parameter)
                    parameter_name = mo.group(1).strip()
                    parameter_value = mo.group(2).strip()
                    write_line(f' --{parameter_name}={parameter_value} \\')
                else:
                    mo = re.search(r'^--(.+)$', parameter)
                    parameter_name = mo.group(1).strip()
                    write_line(f' --{parameter_name} \\')
            write_line(f' {transcriptome_file} \\')
            write_line(f' > {output_file}')
            write_line(' RC=$?')
            write_line(' if [ $RC -ne 0 ]; then manage_error gmap $RC; fi')
            write_line('}')
            write_line(separator_line)

            # function end
            write_line('function end')
            write_line('{')
            write_line(' END_DATETIME=`date --utc +%s`')
            write_line(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`')
            write_line(' calculate_duration')
            write_line(' echo "$SEP"')
            write_line(' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."')
            write_line(' echo "$SEP"')
            write_line(f' RECIPIENT={xconfiguration.get_contact_data()}')
            write_line(f' SUBJECT="{xlib.get_project_name()}: {xlib.get_gmap_name()} process"')
            write_line(mail_message_template.format(xlib.get_gmap_name(), cluster_name, 'OK'))
            write_line(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"')
            write_line(' exit 0')
            write_line('}')
            write_line(separator_line)

            # function manage_error
            write_line('function manage_error')
            write_line('{')
            write_line(' END_DATETIME=`date --utc +%s`')
            write_line(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`')
            write_line(' calculate_duration')
            write_line(' echo "$SEP"')
            write_line(' echo "ERROR: $1 returned error $2"')
            write_line(' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."')
            write_line(' echo "$SEP"')
            write_line(f' RECIPIENT={xconfiguration.get_contact_data()}')
            write_line(f' SUBJECT="{xlib.get_project_name()}: {xlib.get_gmap_name()} process"')
            write_line(mail_message_template.format(xlib.get_gmap_name(), cluster_name, 'WRONG'))
            write_line(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"')
            write_line(' exit 3')
            write_line('}')
            write_line(separator_line)

            # function calculate_duration
            write_line('function calculate_duration')
            write_line('{')
            write_line(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`')
            write_line(' HH=`expr $DURATION / 3600`')
            write_line(' MM=`expr $DURATION % 3600 / 60`')
            write_line(' SS=`expr $DURATION % 60`')
            write_line(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`')
            write_line('}')
            write_line(separator_line)

            # main flow
            write_line('init')
            write_line('build_gmap_database')
            write_line('run_gmap_process')
            write_line('end')
    except Exception as e:
        # only ordinary errors are reported; KeyboardInterrupt and SystemExit propagate
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {gmap_process_script} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
# Getter stems of the bioinfo apps / utilities that can own a result dataset.
# For a stem "<stem>", xlib exposes "get_<stem>_code()" and, unless listed in
# _IRREGULAR_APP_NAME_GETTERS, "get_<stem>_name()".
# The order is the order in which the prefixes are tested; the first match wins.
_RESULT_DATASET_APP_STEMS = (
    'bedtools',
    'blastplus',
    'bcftools',
    'bowtie2',
    'busco',
    'cd_hit',
    'cd_hit_est',
    'cuffdiff',
    'cufflinks',
    'cufflinks_cuffmerge',
    'cuffnorm',
    'cuffquant',
    'cutadapt',
    'ddradseq_simulation',
    'ddradseqtools',
    'detonate',
    'diamond',
    'emboss',
    'entrez_direct',
    'express',
    'fastqc',
    'ggtrinity',
    'gmap_gsnap',
    'gmap',
    'gsnap',
    'gzip',
    'hisat2',
    'htseq',
    'htseq_count',
    'insilico_read_normalization',
    'ipyrad',
    'kallisto',
    'miniconda3',
    'ngshelper',
    'quast',
    'r',
    'raddesigner',
    'ref_eval',
    'rnaquast',
    'rsem',
    'rsem_eval',
    'rsitesearch',
    'samtools',
    'soapdenovo2',
    'soapdenovotrans',
    'star',
    'starcode',
    'toa',
    'toa_process_download_basic_data',
    'toa_process_download_dicots_04',
    'toa_process_download_gene',
    'toa_process_download_go',
    'toa_process_download_gymno_01',
    'toa_process_download_interpro',
    'toa_process_download_monocots_04',
    'toa_process_download_taxonomy',
    'toa_process_gilist_viridiplantae_nucleotide_gi',
    'toa_process_gilist_viridiplantae_protein_gi',
    'toa_process_load_basic_data',
    'toa_process_load_dicots_04',
    'toa_process_load_gene',
    'toa_process_load_go',
    'toa_process_load_gymno_01',
    'toa_process_load_interpro',
    'toa_process_load_monocots_04',
    'toa_process_merge_annotations',
    'toa_process_nr_blastplus_db',
    'toa_process_nr_diamond_db',
    'toa_process_nt_blastplus_db',
    'toa_process_pipeline_aminoacid',
    'toa_process_pipeline_nucleotide',
    'toa_process_proteome_dicots_04',
    'toa_process_proteome_gymno_01',
    'toa_process_proteome_monocots_04',
    'toa_process_proteome_refseq_plant',
    'toa_process_rebuild_toa_database',
    'toa_process_recreate_toa_database',
    'tophat',
    'transabyss',
    'transcript_filter',
    'transcriptome_blastx',
    'transdecoder',
    'transrate',
    'trimmomatic',
    'trinity',
    'variant_calling',
    'vcftools',
    'vcftools_perl_libraries',
    'vsearch',
)

# Stems whose name getter does not follow the "get_<stem>_name" pattern
# (the doubled "get_get_" is the spelling this function has always called).
_IRREGULAR_APP_NAME_GETTERS = {
    'toa_process_rebuild_toa_database': 'get_get_toa_process_rebuild_toa_database_name',
    'toa_process_recreate_toa_database': 'get_get_toa_process_recreate_toa_database_name',
}


def _get_result_dataset_app_name(result_dataset_id):
    '''
    Get the name of the bioinfo app or utility whose code followed by "-" is
    the prefix of a result dataset identification.

    The stems of _RESULT_DATASET_APP_STEMS are tested in order and the name of
    the first match is returned; "xxx" is returned when no prefix matches.
    '''

    for stem in _RESULT_DATASET_APP_STEMS:

        # the getters are resolved one at a time so that a getter placed after
        # the matching one is never looked up nor called
        app_code = getattr(xlib, f'get_{stem}_code')()
        if result_dataset_id.startswith(app_code + '-'):
            name_getter = _IRREGULAR_APP_NAME_GETTERS.get(stem, f'get_{stem}_name')
            return getattr(xlib, name_getter)()

    # the dataset prefix does not correspond to any known app
    return 'xxx'


def form_list_cluster_experiment_processes():
    '''
    List the processes of an experiment in the cluster.

    The user is asked for a running cluster and one of its experiments; then
    the directories found in the result directory of the experiment (except
    "lost+found") are printed in a table together with the name of the bioinfo
    app or utility deduced from the prefix of each dataset identification.

    Nothing is returned: every message is printed in the console and the
    function waits for the user confirmation before ending.
    '''

    # initialize the control variable
    OK = True

    # the connection has to be closed at the end if, and only if, it was opened
    ssh_connected = False

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Logs - List experiment processes in the cluster')

    # get the cluster name
    print(xlib.get_separator())
    if xec2.get_running_cluster_list(only_environment_cluster=True, volume_creator_included=False) != []:
        cluster_name = cinputs.input_cluster_name(volume_creator_included=False, help=True)
    else:
        print('WARNING: There is not any running cluster.')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        ssh_connected = OK
        for error in error_list:
            print(error)

    # get experiment identification
    if OK:
        experiment_id = cinputs.input_experiment_id(ssh_client, help=True)
        if experiment_id == '':
            print(f'WARNING: The cluster {cluster_name} does not have experiment data.')
            OK = False

    # get the result dataset list of the experiment
    if OK:
        command = f'cd {xlib.get_cluster_result_dir()}/{experiment_id}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'
        (OK, stdout, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            dataset_names = (output_line.rstrip('\n') for output_line in stdout)
            result_dataset_id_list = [name for name in dataset_names if name != 'lost+found']
        else:
            # do not let the failure go unnoticed by the user
            print(f'*** ERROR: Wrong command ---> {command}')

    # print the result dataset identification list of the experiment
    if OK:
        print(xlib.get_separator())
        if not result_dataset_id_list:
            print(f'*** WARNING: There is not any result dataset of the experiment {experiment_id}.')
        else:
            result_dataset_id_list.sort()

            # set data width
            result_dataset_width = 30
            bioinfo_app_width = 25

            # set the line template, e.g. "{0:30} {1:25}"
            line_template = '{0:' + str(result_dataset_width) + '} {1:' + str(bioinfo_app_width) + '}'

            # print header
            print(line_template.format('Result dataset', 'Bioinfo app / Utility'))
            print(line_template.format('=' * result_dataset_width, '=' * bioinfo_app_width))

            # print detail lines
            for result_dataset_id in result_dataset_id_list:
                bioinfo_app_name = _get_result_dataset_app_name(result_dataset_id)
                print(line_template.format(result_dataset_id, bioinfo_app_name))

    # close the SSH client connection; this is also done when a step after the
    # connection failed, otherwise the connection would be left open
    if ssh_connected:
        xssh.close_ssh_client_connection(ssh_client)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')