def build_htseq_count_process_script(cluster_name, current_run_dir):
    '''
    Build the current htseq-count process script.

    The options are read from the htseq-count config file. The whole Bash
    script text is assembled in memory and only then written to the path
    returned by get_htseq_count_process_script(), so a failure while building
    the text does not leave a truncated script on disk.

    Parameters:
        cluster_name: cluster name echoed by the script and passed to the
            mail message builders.
        current_run_dir: run directory; the script changes into it before
            running htseq-count and the status paths are obtained from it.

    Returns:
        A tuple (OK, error_list). OK is False when the script could not be
        built or written; error_list then holds the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the htseq-count option dictionary
    htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())

    # get the options
    identification = htseq_count_option_dict['identification']
    parameters = htseq_count_option_dict['htseq-count parameters']
    experiment_id = identification['experiment_id']
    reference_dataset_id = identification['reference_dataset_id']
    annotation_file = identification['annotation_file']
    nprocesses = parameters['nprocesses']
    stranded = parameters['stranded']
    minaqual = parameters['minaqual']
    feature_type = parameters['type']
    idattr = parameters['idattr']
    mode = parameters['mode']
    nonunique = parameters['nonunique']
    other_parameters = parameters['other_parameters']

    # build the alignment dataset identification list from the sections whose
    # identification is like alignment-dataset-n
    # NOTE(review): the sections are sorted as text, so alignment-dataset-10
    # comes before alignment-dataset-2; kept as it was to preserve the order of
    # the BAM files in the command line.
    alignment_dataset_id_list = [
        htseq_count_option_dict[section]['alignment_dataset_id']
        for section in sorted(htseq_count_option_dict.keys())
        if re.match('^alignment-dataset-[0-9]+$', section)
    ]

    # set the annotation file path
    annotation_file = xlib.get_cluster_reference_file(reference_dataset_id, annotation_file)

    # get the path of the script to be written
    script_path = get_htseq_count_process_script()

    # separator line used between the sections of the script
    rule = '#-------------------------------------------------------------------------------\n'

    # build and write the htseq-count process script
    try:

        # environment, status files and "init" function
        script_lines = [
            '#!/bin/bash\n',
            rule,
            'SEP="#########################################"\n',
            'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
            'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
            'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
            'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
            rule,
            f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n',
            'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
            rule,
            f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
            f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
            f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
            'mkdir --parents $STATUS_DIR\n',
            'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
            'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
            rule,
            f'CURRENT_DIR={current_run_dir}\n',
            rule,
            'function init\n',
            '{\n',
            ' INIT_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' echo "$SEP"\n',
            ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
            ' echo "$SEP"\n',
            f' echo "CLUSTER: {cluster_name}"\n',
            ' echo "HOST NAME: $HOSTNAME"\n',
            ' echo "HOST IP: $HOST_IP"\n',
            ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
            '}\n',
            rule,
        ]

        # "print_htseq_count_version" function (the version command itself is
        # emitted as a Bash comment)
        script_lines += [
            'function print_htseq_count_version\n',
            '{\n',
            f' source activate {xlib.get_htseq_anaconda_code()}\n',
            ' echo "$SEP"\n',
            ' # -- htseq-count --version\n',
            ' conda deactivate\n',
            '}\n',
            rule,
        ]

        # "run_htseq_count_process" function: fixed options
        script_lines += [
            'function run_htseq_count_process\n',
            '{\n',
            f' source activate {xlib.get_htseq_anaconda_code()}\n',
            ' cd $CURRENT_DIR\n',
            ' echo "$SEP"\n',
            ' echo "Counting reads ..."\n',
            ' /usr/bin/time \\\n',
            f' --format="{xlib.get_time_output_format(separator=False)}" \\\n',
            ' htseq-count \\\n',
            f' --nprocesses={nprocesses} \\\n',
            ' --format=bam \\\n',
            f' --stranded={stranded.lower()} \\\n',
            f' --minaqual={minaqual} \\\n',
            f' --type={feature_type} \\\n',
            f' --idattr={idattr} \\\n',
            f' --mode={mode.lower()} \\\n',
            f' --nonunique={nonunique.lower()} \\\n',
            ' --quiet \\\n',
        ]

        # "run_htseq_count_process" function: additional options given as
        # "--name=value" or "--name" items separated by ";"
        if other_parameters.upper() != 'NONE':
            for parameter in (x.strip() for x in other_parameters.split(';')):
                # an empty item (e.g. produced by a trailing ";") is ignored
                if not parameter:
                    continue
                if parameter.find('=') > 0:
                    mo = re.search(r'^--(.+)=(.+)$', parameter)
                    if mo is None:
                        raise ValueError(f'the additional parameter "{parameter}" is not like --name=value')
                    parameter_name = mo.group(1).strip()
                    parameter_value = mo.group(2).strip()
                    script_lines.append(f' --{parameter_name}={parameter_value} \\\n')
                else:
                    mo = re.search(r'^--(.+)$', parameter)
                    if mo is None:
                        raise ValueError(f'the additional parameter "{parameter}" is not like --name')
                    parameter_name = mo.group(1).strip()
                    script_lines.append(f' --{parameter_name} \\\n')

        # "run_htseq_count_process" function: alignment files, annotation file
        # and output redirection
        for alignment_dataset_id in alignment_dataset_id_list:
            alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id)}/*.sorted.bam'
            script_lines.append(f' {alignment_files} \\\n')
        script_lines += [
            f' {annotation_file} \\\n',
            ' > read-count.txt\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error htseq-count $RC; fi\n',
            ' echo "Reads are counted."\n',
            ' conda deactivate\n',
            '}\n',
            rule,
        ]

        # "end" and "manage_error" functions
        script_lines += [
            'function end\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail ok\n',
            ' touch $SCRIPT_STATUS_OK\n',
            ' exit 0\n',
            '}\n',
            rule,
            'function manage_error\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "ERROR: $1 returned error $2"\n',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail wrong\n',
            ' touch $SCRIPT_STATUS_WRONG\n',
            ' exit 3\n',
            '}\n',
            rule,
        ]

        # "send_mail" function
        process_name = f'{xlib.get_htseq_count_name()} process'
        mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
        mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
        script_lines += [
            'function send_mail\n',
            '{\n',
            f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
            ' if [ "$1" == "ok" ]; then\n',
            f' MESSAGE="{mail_message_ok}"\n',
            ' elif [ "$1" == "wrong" ]; then\n',
            f' MESSAGE="{mail_message_wrong}"\n',
            ' else\n',
            ' MESSAGE=""\n',
            ' fi\n',
            ' DESTINATION_FILE=mail-destination.json\n',
            ' echo "{" > $DESTINATION_FILE\n',
            f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
            ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
            ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
            ' echo "}" >> $DESTINATION_FILE\n',
            ' MESSAGE_FILE=mail-message.json\n',
            ' echo "{" > $MESSAGE_FILE\n',
            ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }," >> $MESSAGE_FILE\n',
            ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo "}" >> $MESSAGE_FILE\n',
            f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
            '}\n',
            rule,
        ]

        # "calculate_duration" function and the main sequence of calls
        script_lines += [
            'function calculate_duration\n',
            '{\n',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
            ' HH=`expr $DURATION / 3600`\n',
            ' MM=`expr $DURATION % 3600 / 60`\n',
            ' SS=`expr $DURATION % 60`\n',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
            '}\n',
            rule,
            'init\n',
            'print_htseq_count_version\n',
            'run_htseq_count_process\n',
            'end\n',
        ]

        # write the script once its whole text is available
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.writelines(script_lines)

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file. The whole Bash script
    text is assembled in memory and only then written to the path returned by
    get_busco_process_script(), so a failure while building the text does not
    leave a truncated script on disk.

    Parameters:
        cluster_name: cluster name echoed by the script and passed to the
            mail message builders.
        current_run_dir: run directory; the script changes into it, its base
            name is used as the BUSCO output name and the status paths are
            obtained from it.

    Returns:
        A tuple (OK, error_list). OK is False when the assembly software or
        the assembly type is not one of the handled values, or when the
        script could not be built or written; error_list then holds the
        error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    identification = busco_option_dict['identification']
    parameters = busco_option_dict['BUSCO parameters']
    experiment_id = identification['experiment_id']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    ncpu = parameters['ncpu']
    lineage_data_url = parameters['lineage_data_url']
    mode = parameters['mode'].lower()
    evalue = parameters['evalue']
    limit = parameters['limit']
    species = parameters['species']
    long_mode = parameters['long'].upper()
    # the value is kept as typed because it is passed to the tool; it is
    # upper-cased only to be compared with NONE
    augustus_options = parameters['augustus_options']

    # get the file and name from the lineage data url; the name is the text
    # before the first point (or the whole file name when there is no point)
    lineage_data_file = lineage_data_url.split('/')[-1]
    lineage_data = lineage_data_file.split('.', 1)[0]

    # set the transcriptome file name according to the assembly software
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.scafSeq'
        else:
            error_list.append(f'*** ERROR: The assembly type {assembly_type} is not valid for the assembly software {assembly_software}.')
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'
    else:
        error_list.append(f'*** ERROR: The assembly software {assembly_software} is not valid.')

    # without a transcriptome file the script can not be built
    if transcriptome_file_name is None:
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{transcriptome_file_name}'

    # get the path of the script to be written
    script_path = get_busco_process_script()

    # separator line used between the sections of the script
    rule = '#-------------------------------------------------------------------------------\n'

    # build and write the BUSCO process script
    try:

        # environment, status files and "init" function
        script_lines = [
            '#!/bin/bash\n',
            rule,
            'SEP="#########################################"\n',
            'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
            'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
            'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
            'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
            rule,
            f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n',
            'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
            rule,
            f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
            f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
            f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
            'mkdir --parents $STATUS_DIR\n',
            'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
            'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
            rule,
            f'CURRENT_DIR={current_run_dir}\n',
            rule,
            'function init\n',
            '{\n',
            ' INIT_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' echo "$SEP"\n',
            ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
            ' echo "$SEP"\n',
            f' echo "CLUSTER: {cluster_name}"\n',
            ' echo "HOST NAME: $HOSTNAME"\n',
            ' echo "HOST IP: $HOST_IP"\n',
            ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
            '}\n',
            rule,
        ]

        # "download_lineage_data" function: the download is done by a Python
        # one-liner run with the Miniconda3 interpreter
        download_script = f"import requests; r = requests.get('{lineage_data_url}') ; open('{lineage_data_file}' , 'wb').write(r.content)"
        script_lines += [
            'function download_lineage_data\n',
            '{\n',
            ' cd $CURRENT_DIR\n',
            ' echo "$SEP"\n',
            ' echo "Downloading lineage data ..."\n',
            f' $MINICONDA3_BIN_PATH/python3 -c "{download_script}"\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error download_script $RC; fi\n',
            f' tar -xzvf ./{lineage_data_file}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n',
            f' rm ./{lineage_data_file}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n',
            ' echo "Lineage data are downloaded."\n',
            '}\n',
            rule,
        ]

        # "run_busco_process" function: fixed options
        script_lines += [
            'function run_busco_process\n',
            '{\n',
            f' source activate {xlib.get_busco_anaconda_code()}\n',
            ' cd $CURRENT_DIR\n',
            ' echo "$SEP"\n',
            ' echo "Assessing the transcriptome quality ..."\n',
            ' /usr/bin/time \\\n',
            f' --format="{xlib.get_time_output_format(separator=False)}" \\\n',
            ' busco \\\n',
            f' --cpu={ncpu} \\\n',
            f' --lineage_dataset=./{lineage_data} \\\n',
            f' --mode={mode} \\\n',
            f' --evalue={evalue} \\\n',
            f' --limit={limit} \\\n',
        ]

        # "run_busco_process" function: optional options
        if species.upper() != 'NONE':
            script_lines.append(f' --species={species} \\\n')
        if long_mode == 'YES':
            script_lines.append(' --long \\\n')
        if augustus_options.upper() != 'NONE':
            # NOTE(review): the option name "--august_options" is kept as it
            # was; confirm it against the installed BUSCO version.
            script_lines.append(f' --august_options="{augustus_options}" \\\n')

        # "run_busco_process" function: input, output and return code check
        script_lines += [
            f' --in={transcriptome_file} \\\n',
            f' --out={os.path.basename(current_run_dir)}\n',
            ' RC=$?\n',
            ' if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi\n',
            ' echo "The assessment is done."\n',
            ' conda deactivate\n',
            '}\n',
            rule,
        ]

        # "end" and "manage_error" functions
        script_lines += [
            'function end\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail ok\n',
            ' touch $SCRIPT_STATUS_OK\n',
            ' exit 0\n',
            '}\n',
            rule,
            'function manage_error\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "ERROR: $1 returned error $2"\n',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail wrong\n',
            ' touch $SCRIPT_STATUS_WRONG\n',
            ' exit 3\n',
            '}\n',
            rule,
        ]

        # "send_mail" function
        process_name = f'{xlib.get_busco_name()} process'
        mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
        mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
        script_lines += [
            'function send_mail\n',
            '{\n',
            f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
            ' if [ "$1" == "ok" ]; then\n',
            f' MESSAGE="{mail_message_ok}"\n',
            ' elif [ "$1" == "wrong" ]; then\n',
            f' MESSAGE="{mail_message_wrong}"\n',
            ' else\n',
            ' MESSAGE=""\n',
            ' fi\n',
            ' DESTINATION_FILE=mail-destination.json\n',
            ' echo "{" > $DESTINATION_FILE\n',
            f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
            ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
            ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
            ' echo "}" >> $DESTINATION_FILE\n',
            ' MESSAGE_FILE=mail-message.json\n',
            ' echo "{" > $MESSAGE_FILE\n',
            ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }," >> $MESSAGE_FILE\n',
            ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo "}" >> $MESSAGE_FILE\n',
            f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
            '}\n',
            rule,
        ]

        # "calculate_duration" function and the main sequence of calls
        script_lines += [
            'function calculate_duration\n',
            '{\n',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
            ' HH=`expr $DURATION / 3600`\n',
            ' MM=`expr $DURATION % 3600 / 60`\n',
            ' SS=`expr $DURATION % 60`\n',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
            '}\n',
            rule,
            'init\n',
            'download_lineage_data\n',
            'run_busco_process\n',
            'end\n',
        ]

        # write the script once its whole text is available
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.writelines(script_lines)

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_gzip_process_script(cluster_name, dataset_type, current_run_dir):
    '''
    Build the current gzip process script.

    The options are read from the gzip config file of the given dataset type.
    The whole Bash script text is assembled in memory and only then written to
    the path returned by get_gzip_process_script(), so a failure while
    building the text does not leave a truncated script on disk.

    Parameters:
        cluster_name: cluster name echoed by the script and passed to the
            mail message builders.
        dataset_type: dataset type used to locate the gzip config file; the
            dataset type used for everything else is the one read from that
            config file.
        current_run_dir: run directory; the script changes into it and the
            status paths are obtained from it.

    Returns:
        A tuple (OK, error_list). OK is False when the dataset type or the
        action read from the config file is not one of the handled values, or
        when the script could not be built or written; error_list then holds
        the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the gzip option dictionary
    gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))

    # get the options
    experiment_id = gzip_option_dict['identification']['experiment_id']
    dataset_type_2 = gzip_option_dict['identification']['dataset_type']
    dataset_id = gzip_option_dict['identification']['dataset_id']
    action = gzip_option_dict['gzip parameters']['action']

    # build the list of (dataset subdirectory, file name) pairs from the
    # sections whose identification is like file-n
    file_list = [
        (gzip_option_dict[section]['dataset_subdirectory'], gzip_option_dict[section]['file_name'])
        for section in sorted(gzip_option_dict.keys())
        if re.match('^file-[0-9]+$', section)
    ]

    # get the dataset directory; an unknown dataset type is rejected because
    # it would yield a Bash function with an empty body
    if dataset_type_2 == 'reference':
        dataset_dir = xlib.get_cluster_reference_dataset_dir(dataset_id)
    elif dataset_type_2 == 'database':
        dataset_dir = xlib.get_cluster_database_dataset_dir(dataset_id)
    elif dataset_type_2 == 'read':
        dataset_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, dataset_id)
    elif dataset_type_2 in ('result', 'whole-result'):
        dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, dataset_id)
    else:
        error_list.append(f'*** ERROR: The dataset type {dataset_type_2} is not valid.')
        return (False, error_list)

    # an unknown action is rejected because no command would follow the
    # "/usr/bin/time" line continuation
    if action not in ('compress', 'decompress'):
        error_list.append(f'*** ERROR: The action {action} is not valid.')
        return (False, error_list)

    # get the path of the script to be written
    script_path = get_gzip_process_script(dataset_type_2)

    # separator line used between the sections of the script
    rule = '#-------------------------------------------------------------------------------\n'

    # line with the output format passed to /usr/bin/time
    time_format_line = ' --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n'

    # build and write the gzip process script
    try:

        # environment, status files and "init" function
        script_lines = [
            '#!/bin/bash\n',
            rule,
            'SEP="#########################################"\n',
            'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
            'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
            'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
            'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
            rule,
            f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
            f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
            f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
            'mkdir --parents $STATUS_DIR\n',
            'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
            'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
            rule,
            'function init\n',
            '{\n',
            ' INIT_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' echo "$SEP"\n',
            ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
            ' echo "$SEP"\n',
            f' echo "CLUSTER: {cluster_name}"\n',
            ' echo "HOST NAME: $HOSTNAME"\n',
            ' echo "HOST IP: $HOST_IP"\n',
            ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
            '}\n',
            rule,
        ]

        # "run_gzip_process" function
        script_lines += [
            'function run_gzip_process\n',
            '{\n',
            f' cd {current_run_dir}\n',
        ]
        if dataset_type_2 == 'whole-result':
            # the whole dataset is handled with tar and then removed
            if action == 'compress':
                tar_line = f' tar --create --gzip --verbose --file={dataset_dir}.tar.gz {dataset_dir}\n'
            else:
                tar_line = f' tar --extract --gzip --verbose --file={dataset_dir} --directory=/\n'
            script_lines += [
                ' echo "$SEP"\n',
                f' echo "Compressing/decompressing {dataset_dir} ..."\n',
                ' /usr/bin/time \\\n',
                time_format_line,
                tar_line,
                ' RC=$?\n',
                ' if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n',
                ' echo "$SEP"\n',
                f' echo "Removing {dataset_dir} ..."\n',
                ' /usr/bin/time \\\n',
                time_format_line,
                f' rm -rf {dataset_dir}\n',
                ' RC=$?\n',
                ' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n',
            ]
        else:
            # every selected file is handled with its own gzip command
            gzip_command = 'gzip' if action == 'compress' else 'gzip --decompress'
            for dataset_subdirectory, file_name in file_list:
                file_path = f'{dataset_dir}/{dataset_subdirectory}/{file_name}'
                script_lines += [
                    ' echo "$SEP"\n',
                    f' echo "Compressing/decompressing {file_path} ..."\n',
                    ' /usr/bin/time \\\n',
                    time_format_line,
                    f' {gzip_command} {file_path}\n',
                    ' RC=$?\n',
                    ' if [ $RC -ne 0 ]; then manage_error gzip $RC; fi\n',
                ]
        script_lines += [
            '}\n',
            rule,
        ]

        # "end" and "manage_error" functions
        script_lines += [
            'function end\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail ok\n',
            ' touch $SCRIPT_STATUS_OK\n',
            ' exit 0\n',
            '}\n',
            rule,
            'function manage_error\n',
            '{\n',
            ' END_DATETIME=`date --utc +%s`\n',
            ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
            ' calculate_duration\n',
            ' echo "$SEP"\n',
            ' echo "ERROR: $1 returned error $2"\n',
            ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
            ' echo "$SEP"\n',
            ' send_mail wrong\n',
            ' touch $SCRIPT_STATUS_WRONG\n',
            ' exit 3\n',
            '}\n',
            rule,
        ]

        # "send_mail" function
        process_name = f'{xlib.get_gzip_name()} process'
        mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
        mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
        script_lines += [
            'function send_mail\n',
            '{\n',
            f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
            ' if [ "$1" == "ok" ]; then\n',
            f' MESSAGE="{mail_message_ok}"\n',
            ' elif [ "$1" == "wrong" ]; then\n',
            f' MESSAGE="{mail_message_wrong}"\n',
            ' else\n',
            ' MESSAGE=""\n',
            ' fi\n',
            ' DESTINATION_FILE=mail-destination.json\n',
            ' echo "{" > $DESTINATION_FILE\n',
            f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
            ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
            ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
            ' echo "}" >> $DESTINATION_FILE\n',
            ' MESSAGE_FILE=mail-message.json\n',
            ' echo "{" > $MESSAGE_FILE\n',
            ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }," >> $MESSAGE_FILE\n',
            ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
            ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
            ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo " }" >> $MESSAGE_FILE\n',
            ' echo "}" >> $MESSAGE_FILE\n',
            f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
            '}\n',
            rule,
        ]

        # "calculate_duration" function and the main sequence of calls
        script_lines += [
            'function calculate_duration\n',
            '{\n',
            ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
            ' HH=`expr $DURATION / 3600`\n',
            ' MM=`expr $DURATION % 3600 / 60`\n',
            ' SS=`expr $DURATION % 60`\n',
            ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
            '}\n',
            rule,
            'init\n',
            'run_gzip_process\n',
            'end\n',
        ]

        # write the script once its whole text is available
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.writelines(script_lines)

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_infrastructure_software_installation_script(cluster_name):
    '''
    Build the infrastructure software installation script.

    The Bash script is written to the path returned by
    get_infrastructure_software_installation_script(). It defines one shell
    function per step (dataset structure creation when the configured dataset
    structure requires it, AWS CLI installation, AWS set-up, fix of
    /etc/apt/sources.list, package installations, MySQL uninstallation, swap
    file creation) plus the common init, end, manage_error, send_mail and
    calculate_duration functions, and finally the calls to the steps.

    Parameters:
        cluster_name: cluster name; it is echoed by the generated script and
            passed to the mail message builders.

    Returns:
        A tuple (OK, error_list): OK is False and error_list holds the error
        messages when the script file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the connection data (the user identification is not used by this script)
    (_, access_key_id, secret_access_key) = xconfiguration.get_basic_aws_data()

    # get the current region name
    current_region_name = xconfiguration.get_current_region_name()

    # get the NGScloud config file
    ngscloud_config_file = xconfiguration.get_ngscloud_config_file()

    # get the option dictionary corresponding to the NGScloud config file
    ngscloud_options_dict = xlib.get_option_dict(ngscloud_config_file)

    # get the dataset structure
    dataset_structure = ngscloud_options_dict['dataset info']['dataset_structure']

    # get the path of the script and set the line used to separate the script sections
    script_path = get_infrastructure_software_installation_script()
    separator_line = '#-------------------------------------------------------------------------------\n'

    # write the infrastructure software installation script
    try:

        # the dataset structure has to be created by the script only in these dataset structures
        create_dataset_structure = dataset_structure in [
            xconfiguration.get_dataset_structure_singlevolume(),
            xconfiguration.get_dataset_structure_none()
        ]

        os.makedirs(os.path.dirname(script_path), exist_ok=True)

        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:

            # header and environment
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write(separator_line)
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write(separator_line)

            # function init
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write(' INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(f' echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write(' echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write(' echo "HOST IP: $HOST_IP"\n')
            script_file_id.write(' echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')

            # function create_dataset_structure
            if create_dataset_structure:
                script_file_id.write(separator_line)
                script_file_id.write('function create_dataset_structure\n')
                script_file_id.write('{\n')
                script_file_id.write(' echo "$SEP"\n')
                script_file_id.write(' echo "Creating the dataset structure ..."\n')
                script_file_id.write(f' sudo mkdir --parents {xlib.get_cluster_app_dir()}\n')
                script_file_id.write(f' sudo mkdir --parents {xlib.get_cluster_database_dir()}\n')
                script_file_id.write(f' sudo mkdir --parents {xlib.get_cluster_read_dir()}\n')
                script_file_id.write(f' sudo mkdir --parents {xlib.get_cluster_reference_dir()}\n')
                script_file_id.write(f' sudo mkdir --parents {xlib.get_cluster_result_dir()}\n')
                script_file_id.write(' echo "The dataset structure is created."\n')
                script_file_id.write('}\n')

            # function install_awscli
            # (every failure is routed to manage_error, as the other steps do, so that
            # the script stops and reports the command that failed)
            script_file_id.write(separator_line)
            script_file_id.write('function install_awscli\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the AWS CLI ..."\n')
            script_file_id.write(f' unzip {xlib.get_awscli_name()}.zip\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error unzip $RC; fi\n')
            script_file_id.write(' sudo ./aws/install\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error install $RC; fi\n')
            script_file_id.write(' rm -rf aws\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            script_file_id.write(f' rm {xlib.get_awscli_name()}.zip\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function setup_aws
            script_file_id.write(separator_line)
            script_file_id.write('function setup_aws\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Setting up AWS ..."\n')
            script_file_id.write(' UBUNTU_AWS_DIR=/home/ubuntu/.aws\n')
            script_file_id.write(' mkdir --parents $UBUNTU_AWS_DIR\n')
            script_file_id.write(' CONFIG_FILE=$UBUNTU_AWS_DIR/config\n')
            script_file_id.write(' echo "[default]" > $CONFIG_FILE\n')
            script_file_id.write(f' echo "region = {current_region_name}" >> $CONFIG_FILE\n')
            script_file_id.write(' CREDENTIALS_FILE=$UBUNTU_AWS_DIR/credentials\n')
            script_file_id.write(' echo "[default]" > $CREDENTIALS_FILE\n')
            script_file_id.write(f' echo "aws_access_key_id = {access_key_id}" >> $CREDENTIALS_FILE\n')
            script_file_id.write(f' echo "aws_secret_access_key = {secret_access_key}" >> $CREDENTIALS_FILE\n')
            script_file_id.write(' sudo echo "AWS is set up."\n')
            script_file_id.write('}\n')

            # function fix_source_list
            script_file_id.write(separator_line)
            script_file_id.write('function fix_source_list\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Fixing file /etc/apt/sources.list ..."\n')
            script_file_id.write(' sed -i "s/us-east-1.ec2.archive.ubuntu.com/old-releases.ubuntu.com/g" /etc/apt/sources.list\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            # the backslash is doubled so that the script gets the same "\." text
            # without using an invalid Python escape sequence
            script_file_id.write(' sed -i "s/security.ubuntu.com/old-releases.ubuntu\\.com/g" /etc/apt/sources.list\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            script_file_id.write(' apt-get update\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo\n')
            script_file_id.write(' echo "The file is fixed."\n')
            script_file_id.write('}\n')

            # function install_xorg
            script_file_id.write(separator_line)
            script_file_id.write('function install_xorg\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the package xorg ..."\n')
            script_file_id.write(' sudo apt-get --assume-yes install xorg\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function install_libtbb2
            script_file_id.write(separator_line)
            script_file_id.write('function install_libtbb2\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the package libtbb2 ..."\n')
            script_file_id.write(' echo\n')
            script_file_id.write(' apt-get --assume-yes install libtbb2\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function install_libxt6
            script_file_id.write(separator_line)
            script_file_id.write('function install_libxt6\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the package libxt6 ..."\n')
            script_file_id.write(' sudo apt-get --assume-yes install libxt6\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function install_parallel
            script_file_id.write(separator_line)
            script_file_id.write('function install_parallel\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the package parallel ..."\n')
            script_file_id.write(' sudo apt-get --assume-yes install parallel\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function install_texlive
            script_file_id.write(separator_line)
            script_file_id.write('function install_texlive\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Installing the package texlive ..."\n')
            script_file_id.write(' sudo apt-get --assume-yes install texlive-latex-base\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' sudo apt-get --assume-yes install texlive-fonts-recommended\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' sudo apt-get --assume-yes install texlive-fonts-extra\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' sudo apt-get --assume-yes install texlive-latex-extra\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo "The package is installed."\n')
            script_file_id.write('}\n')

            # function uninstall_mysql
            script_file_id.write(separator_line)
            script_file_id.write('function uninstall_mysql\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Uninstalling MySQL ..."\n')
            script_file_id.write(' sudo apt-get purge --auto-remove --assume-yes mysql-client mysql-client-5.5 mysql-client-core-5.5 mysql-common mysql-server mysql-server-5.5 mysql-server-core-5.5\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(' echo "MySQL is uninstalled."\n')
            script_file_id.write('}\n')

            # function create_swapfile (it is defined but its call is disabled at the end of the script)
            script_file_id.write(separator_line)
            script_file_id.write('function create_swapfile\n')
            script_file_id.write('{\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Creating a file which will be used for swap ..."\n')
            script_file_id.write(' sudo dd if=/dev/zero of=/swapfile bs=1024 count=2097152\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error dd $RC; fi\n')
            script_file_id.write(' sudo chmod 600 /swapfile\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error chmod $RC; fi\n')
            script_file_id.write(' sudo mkswap /swapfile\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error mkswap $RC; fi\n')
            script_file_id.write(' sudo swapon /swapfile\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error swapon $RC; fi\n')
            script_file_id.write(' sudo echo "/swapfile swap swap defaults 0 0" >> /etc/fstab\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error echo $RC; fi\n')
            script_file_id.write(' echo\n')
            script_file_id.write(' echo "The file is created."\n')
            script_file_id.write('}\n')

            # function end
            script_file_id.write(separator_line)
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail ok\n')
            script_file_id.write(' exit 0\n')
            script_file_id.write('}\n')

            # function manage_error
            script_file_id.write(separator_line)
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "ERROR: $1 returned error $2"\n')
            script_file_id.write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail wrong\n')
            script_file_id.write(' exit 3\n')
            script_file_id.write('}\n')

            # function send_mail
            script_file_id.write(separator_line)
            process_name = 'Infrastructure software installation'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write(' if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_ok}"\n')
            script_file_id.write(' elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write(' else\n')
            script_file_id.write(' MESSAGE=""\n')
            script_file_id.write(' fi\n')
            script_file_id.write(' DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write(' echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write(' echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write(' MESSAGE_FILE=mail-message.json\n')
            script_file_id.write(' echo "{" > $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')

            # function calculate_duration
            script_file_id.write(separator_line)
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write(' HH=`expr $DURATION / 3600`\n')
            script_file_id.write(' MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write(' SS=`expr $DURATION % 60`\n')
            script_file_id.write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')

            # calls to the functions
            script_file_id.write(separator_line)
            script_file_id.write('init\n')
            if create_dataset_structure:
                script_file_id.write('create_dataset_structure\n')
            script_file_id.write('install_awscli\n')
            script_file_id.write('setup_aws\n')
            script_file_id.write('fix_source_list\n')
            script_file_id.write('install_xorg\n')
            script_file_id.write('install_libtbb2\n')
            script_file_id.write('install_libxt6\n')
            script_file_id.write('install_parallel\n')
            script_file_id.write('install_texlive\n')
            script_file_id.write('uninstall_mysql\n')
            # -- script_file_id.write( 'create_swapfile\n')
            script_file_id.write('end\n')

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the CD-HIT-EST config file and the Bash script
    is written to the path returned by get_cd_hit_est_process_script(). The
    generated script runs cd-hit-est on the transcriptome file of the selected
    assembly dataset and writes its output to
    "<current_run_dir>/clustered-transcriptome.fasta".

    Parameters:
        cluster_name: cluster name; it is echoed by the generated script and
            passed to the mail message builders.
        current_run_dir: run directory; it is used as working directory, as
            output directory and to locate the status files.

    Returns:
        A tuple (OK, error_list): OK is False and error_list holds the error
        messages when the script file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    experiment_id = cd_hit_est_option_dict['identification']['experiment_id']
    assembly_software = cd_hit_est_option_dict['identification']['assembly_software']
    assembly_dataset_id = cd_hit_est_option_dict['identification']['assembly_dataset_id']
    assembly_type = cd_hit_est_option_dict['identification']['assembly_type']
    threads = cd_hit_est_option_dict['CD-HIT-EST parameters']['threads']
    memory_limit = cd_hit_est_option_dict['CD-HIT-EST parameters']['memory_limit']
    seq_identity_threshold = cd_hit_est_option_dict['CD-HIT-EST parameters']['seq_identity_threshold']
    word_length = cd_hit_est_option_dict['CD-HIT-EST parameters']['word_length']
    mask = cd_hit_est_option_dict['CD-HIT-EST parameters']['mask']
    match = cd_hit_est_option_dict['CD-HIT-EST parameters']['match']
    mismatch = cd_hit_est_option_dict['CD-HIT-EST parameters']['mismatch']
    other_parameters = cd_hit_est_option_dict['CD-HIT-EST parameters']['other_parameters']

    # set the transcriptome file path (it stays None when the assembly software
    # or the assembly type is not one of the expected values)
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dataset_dir}/filtered-transcriptome.fasta'

    # set the output file path
    output_file = f'{current_run_dir}/clustered-transcriptome.fasta'

    # get the path of the script and set the line used to separate the script sections
    script_path = get_cd_hit_est_process_script()
    separator_line = '#-------------------------------------------------------------------------------\n'

    # write the CD-HIT-EST process script
    try:

        # check the transcriptome file before writing anything
        if transcriptome_file is None:
            raise ValueError(f'There is no transcriptome file for the assembly software "{assembly_software}" and the assembly type "{assembly_type}"')

        # build the last lines of the cd-hit-est command: the output option and
        # the other parameters, written "--name=value" or "--name" and separated by ";"
        final_option_list = [f' -o {output_file}']
        if other_parameters.upper() != 'NONE':
            for parameter in [x.strip() for x in other_parameters.split(';')]:
                if parameter.find('=') > 0:
                    mo = re.search(r'^--(.+)=(.+)$', parameter)
                else:
                    mo = re.search(r'^--(.+)$', parameter)
                if mo is None:
                    raise ValueError(f'The parameter "{parameter}" of other_parameters is not like --name or --name=value')
                if mo.lastindex == 2:
                    final_option_list.append(f' -{mo.group(1).strip()} {mo.group(2).strip()}')
                else:
                    final_option_list.append(f' -{mo.group(1).strip()}')

        os.makedirs(os.path.dirname(script_path), exist_ok=True)

        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:

            # header and environment
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write(separator_line)
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write(separator_line)
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write('export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write(separator_line)
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write('mkdir --parents $STATUS_DIR\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write(separator_line)
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write(separator_line)

            # function init
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write(' INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(f' echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write(' echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write(' echo "HOST IP: $HOST_IP"\n')
            script_file_id.write(' echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')

            # function run_cd_hit_est_process
            script_file_id.write(separator_line)
            script_file_id.write('function run_cd_hit_est_process\n')
            script_file_id.write('{\n')
            script_file_id.write(f' source activate {xlib.get_cd_hit_anaconda_code()}\n')
            script_file_id.write(' cd $CURRENT_DIR\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Filtering transcriptome ..."\n')
            script_file_id.write(' /usr/bin/time \\\n')
            script_file_id.write(f' --format="{xlib.get_time_output_format()}" \\\n')
            script_file_id.write(' cd-hit-est \\\n')
            script_file_id.write(f' -T {threads} \\\n')
            script_file_id.write(f' -M {memory_limit} \\\n')
            script_file_id.write(f' -i {transcriptome_file} \\\n')
            script_file_id.write(f' -c {seq_identity_threshold} \\\n')
            script_file_id.write(f' -n {word_length} \\\n')
            script_file_id.write(f' -mask {mask} \\\n')
            script_file_id.write(f' -match {match} \\\n')
            script_file_id.write(f' -mismatch {mismatch} \\\n')
            # every option line but the last one ends with a line continuation, so
            # that the command finishes before the following "RC=$?" line
            script_file_id.write(' \\\n'.join(final_option_list) + '\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi\n')
            script_file_id.write(' echo "The transcriptome is filtered."\n')
            script_file_id.write(' conda deactivate\n')
            script_file_id.write('}\n')

            # function end
            script_file_id.write(separator_line)
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail ok\n')
            script_file_id.write(' touch $SCRIPT_STATUS_OK\n')
            script_file_id.write(' exit 0\n')
            script_file_id.write('}\n')

            # function manage_error
            script_file_id.write(separator_line)
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "ERROR: $1 returned error $2"\n')
            script_file_id.write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail wrong\n')
            script_file_id.write(' touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write(' exit 3\n')
            script_file_id.write('}\n')

            # function send_mail
            script_file_id.write(separator_line)
            process_name = f'{xlib.get_cd_hit_est_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write(' if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_ok}"\n')
            script_file_id.write(' elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write(' else\n')
            script_file_id.write(' MESSAGE=""\n')
            script_file_id.write(' fi\n')
            script_file_id.write(' DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write(' echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write(' echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write(' MESSAGE_FILE=mail-message.json\n')
            script_file_id.write(' echo "{" > $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')

            # function calculate_duration
            script_file_id.write(separator_line)
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write(' HH=`expr $DURATION / 3600`\n')
            script_file_id.write(' MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write(' SS=`expr $DURATION % 60`\n')
            script_file_id.write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')

            # calls to the functions
            script_file_id.write(separator_line)
            script_file_id.write('init\n')
            script_file_id.write('run_cd_hit_est_process\n')
            script_file_id.write('end\n')

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_express_process_script(cluster_name, current_run_dir):
    '''
    Build the current eXpress process script.

    The options are read from the eXpress config file and a Bash script is
    written to the path returned by get_express_process_script(). For every
    config section named like alignment-dataset-n, the script lists the
    "*.sorted.bam" files of that alignment dataset and runs "express" once per
    listed file against the transcriptome file selected by the assembly options.

    Parameters:
        cluster_name: cluster name; it is echoed by the script and passed to
            the mail message builders.
        current_run_dir: run directory; the script changes to it and the
            status file paths are got from it.

    Returns:
        A tuple (OK, error_list). OK is False when the transcriptome file can
        not be set, when "other_parameters" is malformed or when the script
        can not be written; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the eXpress option dictionary
    express_option_dict = xlib.get_option_dict(get_express_config_file())

    # get the options
    identification_dict = express_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    parameters_dict = express_option_dict['eXpress parameters']
    frag_len_mean = parameters_dict['frag-len-mean']
    frag_len_stddev = parameters_dict['frag-len-stddev']
    library_type = parameters_dict['library_type']
    max_indel_size = parameters_dict['max-indel-size']
    no_bias_correct = parameters_dict['no-bias-correct']
    no_error_model = parameters_dict['no-error-model']
    other_parameters = parameters_dict['other_parameters']

    # build the alignment dataset identification list from the sections whose
    # identification is like alignment-dataset-n (in sorted section order)
    alignment_dataset_id_list = [
        express_option_dict[section]['alignment_dataset_id']
        for section in sorted(express_option_dict.keys())
        if re.match('^alignment-dataset-[0-9]+$', section)
    ]

    # set the transcriptome file path; it remains None when the assembly
    # software or the assembly type is not one of the expected values
    assembly_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{assembly_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{assembly_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dir}/filtered-transcriptome.fasta'
    if transcriptome_file is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be set with the assembly software {assembly_software} and the assembly type {assembly_type}.')
        OK = False

    # translate "other_parameters" (items separated by ";", each one like
    # --name or --name=value) into command line options before any file is
    # touched, so that a malformed item is reported as such
    other_option_list = []
    if other_parameters.upper() != 'NONE':
        for parameter in [x.strip() for x in other_parameters.split(';')]:
            pattern = r'^--(.+)=(.+)$' if parameter.find('=') > 0 else r'^--(.+)$'
            mo = re.search(pattern, parameter)
            if mo is None:
                error_list.append(f'*** ERROR: The item "{parameter}" of other_parameters has to be like --name or --name=value.')
                OK = False
            else:
                # one group yields --name, two groups yield --name=value
                other_option_list.append('--' + '='.join(group.strip() for group in mo.groups()))

    # nothing is written if the options are not usable
    if not OK:
        return (OK, error_list)

    # the separator line between the script sections
    separator_line = '#-------------------------------------------------------------------------------\n'

    # write the eXpress process script
    try:
        script_path = get_express_process_script()
        if not os.path.exists(os.path.dirname(script_path)):
            os.makedirs(os.path.dirname(script_path))
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            write = script_file_id.write

            # environment, status files and function "init"
            script_file_id.writelines([
                '#!/bin/bash\n',
                separator_line,
                'SEP="#########################################"\n',
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
                separator_line,
                f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n',
                'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
                separator_line,
                f'CURRENT_DIR={current_run_dir}\n',
                separator_line,
                f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
                f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
                f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
                'mkdir --parents $STATUS_DIR\n',
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
                separator_line,
                'function init\n',
                '{\n',
                ' INIT_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' echo "$SEP"\n',
                ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
                ' echo "$SEP"\n',
                f' echo "CLUSTER: {cluster_name}"\n',
                ' echo "HOST NAME: $HOSTNAME"\n',
                ' echo "HOST IP: $HOST_IP"\n',
                ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
                '}\n',
                separator_line,
            ])

            # function "run_express_process"
            write('function run_express_process\n')
            write('{\n')
            write(f' source activate {xlib.get_express_anaconda_code()}\n')
            write(' cd $CURRENT_DIR\n')
            library_type_option = library_type.lower()
            for alignment_dataset_id in alignment_dataset_id_list:
                alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id)}/*.sorted.bam'
                write(f' SORTED_BAM_LIST={alignment_dataset_id}-sorted-bam-files.txt\n')
                write(f' ls {alignment_files} > $SORTED_BAM_LIST\n')
                write(' while read FILE_BAM; do\n')
                write(' NAME=`basename $FILE_BAM`\n')
                # the last 11 characters of the file name (".sorted.bam") are dropped
                write(' NAME=${NAME:0:-11}\n')
                write(f' SUBDIR={alignment_dataset_id}-$NAME\n')
                write(' mkdir --parents $CURRENT_DIR/$SUBDIR\n')
                write(' echo "$SEP"\n')
                write(f' echo "Quantitating alignment dataset {alignment_dataset_id} - library $SUBDIR ..."\n')
                write(' /usr/bin/time \\\n')
                write(f' --format="{xlib.get_time_output_format(separator=False)}" \\\n')
                write(' express \\\n')
                write(' --no-update-check \\\n')
                write(f' --frag-len-mean {frag_len_mean} \\\n')
                write(f' --frag-len-stddev {frag_len_stddev} \\\n')
                # any other library type value adds no option
                if library_type_option in ('fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded'):
                    write(f' --{library_type_option} \\\n')
                write(f' --max-indel-size {max_indel_size} \\\n')
                if no_bias_correct.upper() == 'YES':
                    write(' --no-bias-correct \\\n')
                if no_error_model.upper() == 'YES':
                    write(' --no-error-model \\\n')
                for other_option in other_option_list:
                    write(f' {other_option} \\\n')
                write(' --output-dir $CURRENT_DIR/$SUBDIR \\\n')
                write(f' {transcriptome_file} \\\n')
                write(' $FILE_BAM\n')
                write(' RC=$?\n')
                write(' if [ $RC -ne 0 ]; then manage_error express $RC; fi\n')
                write(' echo "Quantitation is done."\n')
                write(' done < $SORTED_BAM_LIST\n')
            write(' conda deactivate\n')
            write('}\n')

            # functions "end" and "manage_error"
            script_file_id.writelines([
                separator_line,
                'function end\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail ok\n',
                ' touch $SCRIPT_STATUS_OK\n',
                ' exit 0\n',
                '}\n',
                separator_line,
                'function manage_error\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "ERROR: $1 returned error $2"\n',
                ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail wrong\n',
                ' touch $SCRIPT_STATUS_WRONG\n',
                ' exit 3\n',
                '}\n',
                separator_line,
            ])

            # function "send_mail"
            process_name = f'{xlib.get_express_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.writelines([
                'function send_mail\n',
                '{\n',
                f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
                ' if [ "$1" == "ok" ]; then\n',
                f' MESSAGE="{mail_message_ok}"\n',
                ' elif [ "$1" == "wrong" ]; then\n',
                f' MESSAGE="{mail_message_wrong}"\n',
                ' else\n',
                ' MESSAGE=""\n',
                ' fi\n',
                ' DESTINATION_FILE=mail-destination.json\n',
                ' echo "{" > $DESTINATION_FILE\n',
                f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
                ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
                ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
                ' echo "}" >> $DESTINATION_FILE\n',
                ' MESSAGE_FILE=mail-message.json\n',
                ' echo "{" > $MESSAGE_FILE\n',
                ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }," >> $MESSAGE_FILE\n',
                ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo "}" >> $MESSAGE_FILE\n',
                f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
                '}\n',
                separator_line,
            ])

            # function "calculate_duration" and the main calls
            script_file_id.writelines([
                'function calculate_duration\n',
                '{\n',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
                ' HH=`expr $DURATION / 3600`\n',
                ' MM=`expr $DURATION % 3600 / 60`\n',
                ' SS=`expr $DURATION % 60`\n',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
                '}\n',
                separator_line,
                'init\n',
                'run_express_process\n',
                'end\n',
            ])
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_express_process_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_fastqc_process_script(cluster_name, current_run_dir):
    '''
    Build the current FastQC process script.

    The options are read from the FastQC config file and a Bash script is
    written to the path returned by get_fastqc_process_script(). The script
    runs "fastqc" once for the file name of every config section named like
    file-n, with the run directory as output directory.

    Parameters:
        cluster_name: cluster name; it is echoed by the script and passed to
            the mail message builders.
        current_run_dir: run directory; the script changes to it and the
            status file paths are got from it.

    Returns:
        A tuple (OK, error_list). OK is False when an exception is raised
        while the script is being written; error_list holds the messages.
    '''

    # control variable and error list returned to the caller
    OK = True
    error_list = []

    # load the FastQC options
    option_dict = xlib.get_option_dict(get_fastqc_config_file())
    experiment_id = option_dict['identification']['experiment_id']
    read_dataset_id = option_dict['identification']['read_dataset_id']
    threads = option_dict['FastQC parameters']['threads']

    # collect the file names of the sections whose identification is like
    # file-n, following the sorted section order
    file_name_list = [
        option_dict[section]['file_name']
        for section in sorted(option_dict.keys())
        if re.match('^file-[0-9]+$', section)
    ]

    # separator line between the script sections
    rule = '#-------------------------------------------------------------------------------\n'

    # write the FastQC process script
    try:
        script_path = get_fastqc_process_script()
        script_dir = os.path.dirname(script_path)
        if not os.path.exists(script_dir):
            os.makedirs(script_dir)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            # fixed text goes out in batches; every interpolated line is
            # written on its own, in the same order as the fixed text around it
            emit = script_file_id.write
            emit_all = script_file_id.writelines

            # environment and status files
            emit_all((
                '#!/bin/bash\n',
                rule,
                'SEP="#########################################"\n',
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
                rule,
            ))
            emit(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            emit_all((
                'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
                rule,
            ))
            emit(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            emit(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            emit(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            emit_all((
                'mkdir --parents $STATUS_DIR\n',
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
                rule,
            ))

            # function "init"
            emit_all((
                'function init\n',
                '{\n',
                ' INIT_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' echo "$SEP"\n',
                ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
                ' echo "$SEP"\n',
            ))
            emit(f' echo "CLUSTER: {cluster_name}"\n')
            emit_all((
                ' echo "HOST NAME: $HOSTNAME"\n',
                ' echo "HOST IP: $HOST_IP"\n',
                ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
                '}\n',
                rule,
            ))

            # function "run_fastqc_process": one fastqc run per file
            emit_all((
                'function run_fastqc_process\n',
                '{\n',
            ))
            emit(f' source activate {xlib.get_fastqc_anaconda_code()}\n')
            emit(f' cd {current_run_dir}\n')
            emit_all((
                ' echo "$SEP"\n',
                ' fastqc --version\n',
            ))
            for file_name in file_name_list:
                emit_all((
                    ' echo "$SEP"\n',
                    ' /usr/bin/time \\\n',
                ))
                emit(f' --format="{xlib.get_time_output_format()}" \\\n')
                emit(' fastqc \\\n')
                emit(f' {xlib.get_cluster_read_file(experiment_id, read_dataset_id, file_name)} \\\n')
                emit(f' --threads={threads} \\\n')
                emit(f' --outdir={current_run_dir}\n')
                emit_all((
                    ' RC=$?\n',
                    ' if [ $RC -ne 0 ]; then manage_error fastqc $RC; fi\n',
                ))
            emit_all((
                ' conda deactivate\n',
                '}\n',
                rule,
            ))

            # functions "end" and "manage_error"
            emit_all((
                'function end\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail ok\n',
                ' touch $SCRIPT_STATUS_OK\n',
                ' exit 0\n',
                '}\n',
                rule,
                'function manage_error\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "ERROR: $1 returned error $2"\n',
                ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail wrong\n',
                ' touch $SCRIPT_STATUS_WRONG\n',
                ' exit 3\n',
                '}\n',
                rule,
            ))

            # function "send_mail"
            process_name = f'{xlib.get_fastqc_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            emit_all((
                'function send_mail\n',
                '{\n',
            ))
            emit(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            emit(' if [ "$1" == "ok" ]; then\n')
            emit(f' MESSAGE="{mail_message_ok}"\n')
            emit(' elif [ "$1" == "wrong" ]; then\n')
            emit(f' MESSAGE="{mail_message_wrong}"\n')
            emit_all((
                ' else\n',
                ' MESSAGE=""\n',
                ' fi\n',
                ' DESTINATION_FILE=mail-destination.json\n',
                ' echo "{" > $DESTINATION_FILE\n',
            ))
            emit(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            emit_all((
                ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
                ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
                ' echo "}" >> $DESTINATION_FILE\n',
                ' MESSAGE_FILE=mail-message.json\n',
                ' echo "{" > $MESSAGE_FILE\n',
                ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }," >> $MESSAGE_FILE\n',
                ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo "}" >> $MESSAGE_FILE\n',
            ))
            emit(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')

            # function "calculate_duration" and the main calls
            emit_all((
                '}\n',
                rule,
                'function calculate_duration\n',
                '{\n',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
                ' HH=`expr $DURATION / 3600`\n',
                ' MM=`expr $DURATION % 3600 / 60`\n',
                ' SS=`expr $DURATION % 60`\n',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
                '}\n',
                rule,
                'init\n',
                'run_fastqc_process\n',
                'end\n',
            ))
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_fastqc_process_script()} can not be created')
        OK = False

    # hand back the control variable and the error list
    return (OK, error_list)
def build_cutadapt_process_script(cluster_name, current_run_dir):
    '''
    Build the current cutadapt process script.

    The options are read from the cutadapt config file and a Bash script is
    written to the path returned by get_cutadapt_process_script(). The script
    runs "cutadapt" once for every config section named like library-n, with
    one read file when the read type is SE and two read files when it is PE.

    Parameters:
        cluster_name: cluster name; it is echoed by the script and passed to
            the mail message builders.
        current_run_dir: run directory; the script changes to it, the status
            file paths are got from it and its base name identifies the
            output read dataset.

    Returns:
        A tuple (OK, error_list). OK is False when "other_parameters" is
        malformed or when the script can not be written; error_list holds the
        corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the cutadapt option dictionary
    cutadapt_option_dict = xlib.get_option_dict(get_cutadapt_config_file())

    # get the options
    experiment_id = cutadapt_option_dict['identification']['experiment_id']
    read_dataset_id = cutadapt_option_dict['identification']['read_dataset_id']
    parameters_dict = cutadapt_option_dict['cutadapt parameters']
    cores = parameters_dict['cores']
    adapter = parameters_dict['adapter']
    adapter_pe = parameters_dict['adapter_pe']
    front = parameters_dict['front']
    front_pe = parameters_dict['front_pe']
    anywhere = parameters_dict['anywhere']
    anywhere_pe = parameters_dict['anywhere_pe']
    other_parameters = parameters_dict['other_parameters']
    read_type_code = cutadapt_option_dict['library']['read_type'].upper()

    # build the list of read file pairs from the sections whose identification
    # is like library-n (in sorted section order); the second item is None
    # unless the read type is PE
    library_list = []
    for section in sorted(cutadapt_option_dict.keys()):
        if re.match('^library-[0-9]+$', section):
            read_file_1 = xlib.get_cluster_read_file(experiment_id, read_dataset_id, cutadapt_option_dict[section]['read_file_1'])
            read_file_2 = None
            if read_type_code == 'PE':
                read_file_2 = xlib.get_cluster_read_file(experiment_id, read_dataset_id, cutadapt_option_dict[section]['read_file_2'])
            library_list.append((read_file_1, read_file_2))

    # get the output read directory
    output_read_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, os.path.basename(current_run_dir))

    # translate "other_parameters" (items separated by ";", each one like
    # --name or --name=value) into command line options before any file is
    # touched, so that a malformed item is reported as such
    other_option_list = []
    if other_parameters.upper() != 'NONE':
        for parameter in [x.strip() for x in other_parameters.split(';')]:
            pattern = r'^--(.+)=(.+)$' if parameter.find('=') > 0 else r'^--(.+)$'
            mo = re.search(pattern, parameter)
            if mo is None:
                error_list.append(f'*** ERROR: The item "{parameter}" of other_parameters has to be like --name or --name=value.')
                OK = False
            else:
                # one group yields --name, two groups yield --name=value
                other_option_list.append('--' + '='.join(group.strip() for group in mo.groups()))

    # nothing is written if the options are not usable
    if not OK:
        return (OK, error_list)

    # the separator line between the script sections
    separator_line = '#-------------------------------------------------------------------------------\n'

    # write the cutadapt process script
    try:
        # build the options shared by all the cutadapt runs; the optional
        # ones are skipped when their value is NONE
        option_list = [f'--cores={cores}', f'--adapter={adapter}']
        if adapter_pe.upper() != 'NONE':
            option_list.append(f'-A {adapter_pe}')
        if front.upper() != 'NONE':
            option_list.append(f'--front {front}')
        if front_pe.upper() != 'NONE':
            option_list.append(f'-G {front_pe}')
        if anywhere.upper() != 'NONE':
            option_list.append(f'--anywhere {anywhere}')
        if anywhere_pe.upper() != 'NONE':
            option_list.append(f'-B {anywhere_pe}')
        option_list.extend(other_option_list)

        script_path = get_cutadapt_process_script()
        if not os.path.exists(os.path.dirname(script_path)):
            os.makedirs(os.path.dirname(script_path))
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            write = script_file_id.write

            # environment, status files and function "init"
            script_file_id.writelines([
                '#!/bin/bash\n',
                separator_line,
                'SEP="#########################################"\n',
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
                separator_line,
                f'CUTADAPT_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/envs/{xlib.get_cutadapt_anaconda_code()}/bin\n',
                'export PATH=$CUTADAPT_PATH:$PATH\n',
                separator_line,
                f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n',
                f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n',
                f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n',
                'mkdir --parents $STATUS_DIR\n',
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
                separator_line,
                'function init\n',
                '{\n',
                ' INIT_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' echo "$SEP"\n',
                ' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
                ' echo "$SEP"\n',
                f' echo "CLUSTER: {cluster_name}"\n',
                ' echo "HOST NAME: $HOSTNAME"\n',
                ' echo "HOST IP: $HOST_IP"\n',
                ' echo "HOST ADDRESS: $HOST_ADDRESS"\n',
                '}\n',
                separator_line,
            ])

            # function "run_cutadapt_process": one cutadapt run per library
            write('function run_cutadapt_process\n')
            write('{\n')
            write(f' mkdir --parents {output_read_dir}\n')
            write(f' cd {current_run_dir}\n')
            write(' echo "$SEP"\n')
            write(' echo "cutadapt v`cutadapt --version`"\n')
            for read_file_1, read_file_2 in library_list:
                write(' echo "$SEP"\n')
                write(' /usr/bin/time \\\n')
                write(f' --format="{xlib.get_time_output_format()}" \\\n')
                write(' cutadapt \\\n')
                for option in option_list:
                    write(f' {option} \\\n')
                # the output and input files are only written for the read
                # types SE and PE
                if read_type_code == 'SE':
                    write(f' --output={output_read_dir}/{os.path.basename(read_file_1)} \\\n')
                    write(f' {read_file_1}\n')
                elif read_type_code == 'PE':
                    write(f' --output={output_read_dir}/{os.path.basename(read_file_1)} \\\n')
                    write(f' --paired-output={output_read_dir}/{os.path.basename(read_file_2)} \\\n')
                    write(f' {read_file_1} \\\n')
                    write(f' {read_file_2}\n')
                write(' RC=$?\n')
                write(' if [ $RC -ne 0 ]; then manage_error cutadapt $RC; fi\n')
            write('}\n')

            # functions "end" and "manage_error"
            script_file_id.writelines([
                separator_line,
                'function end\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail ok\n',
                ' touch $SCRIPT_STATUS_OK\n',
                ' exit 0\n',
                '}\n',
                separator_line,
                'function manage_error\n',
                '{\n',
                ' END_DATETIME=`date --utc +%s`\n',
                ' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                ' calculate_duration\n',
                ' echo "$SEP"\n',
                ' echo "ERROR: $1 returned error $2"\n',
                ' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                ' echo "$SEP"\n',
                ' send_mail wrong\n',
                ' touch $SCRIPT_STATUS_WRONG\n',
                ' exit 3\n',
                '}\n',
                separator_line,
            ])

            # function "send_mail"
            process_name = f'{xlib.get_cutadapt_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.writelines([
                'function send_mail\n',
                '{\n',
                f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n',
                ' if [ "$1" == "ok" ]; then\n',
                f' MESSAGE="{mail_message_ok}"\n',
                ' elif [ "$1" == "wrong" ]; then\n',
                f' MESSAGE="{mail_message_wrong}"\n',
                ' else\n',
                ' MESSAGE=""\n',
                ' fi\n',
                ' DESTINATION_FILE=mail-destination.json\n',
                ' echo "{" > $DESTINATION_FILE\n',
                f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n',
                ' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n',
                ' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n',
                ' echo "}" >> $DESTINATION_FILE\n',
                ' MESSAGE_FILE=mail-message.json\n',
                ' echo "{" > $MESSAGE_FILE\n',
                ' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }," >> $MESSAGE_FILE\n',
                ' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n',
                ' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
                ' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo " }" >> $MESSAGE_FILE\n',
                ' echo "}" >> $MESSAGE_FILE\n',
                f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n',
                '}\n',
                separator_line,
            ])

            # function "calculate_duration" and the main calls
            script_file_id.writelines([
                'function calculate_duration\n',
                '{\n',
                ' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
                ' HH=`expr $DURATION / 3600`\n',
                ' MM=`expr $DURATION % 3600 / 60`\n',
                ' SS=`expr $DURATION % 60`\n',
                ' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
                '}\n',
                separator_line,
                'init\n',
                'run_cutadapt_process\n',
                'end\n',
            ])
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_cutadapt_process_script()} can not be created.')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)