def form_restart_pipeline_process(pipeline_type):
    '''
    Restart a pipeline process from the last step ended OK.

    Parameters:
        pipeline_type: code of the pipeline (nucleotide or amino acid).
    '''

    # initialize the control variable
    OK = True

    # set the pipeline name corresponding to the pipeline type
    if pipeline_type == xlib.get_toa_process_pipeline_nucleotide_code():
        name = xlib.get_toa_process_pipeline_nucleotide_name()
    elif pipeline_type == xlib.get_toa_process_pipeline_aminoacid_code():
        name = xlib.get_toa_process_pipeline_aminoacid_name()
    else:
        # fail fast on an unknown pipeline type; the original code left
        # "name" unbound and crashed later with a NameError
        raise ValueError(f'Invalid pipeline type: {pipeline_type}')

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'{name} - Run process')

    # get the pipeline dataset identification
    app_list = [pipeline_type]
    pipeline_dataset_id = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print(f'WARNING: There are not any {pipeline_type} result datasets.')
        OK = False

    # confirm the process run
    if OK:
        print(xlib.get_separator())
        OK = clib.confirm_action(f'The {name} process is going to be run.')

    # run the process
    if OK:
        devstdout = xlib.DevStdOut(xtoa.restart_pipeline_process.__name__)
        OK = xtoa.restart_pipeline_process(pipeline_type, pipeline_dataset_id, devstdout, function=None)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def form_view_phylogenic_data_frecuency(stats_code):
    '''
    View the frequency distribution of phylogenic data.

    Parameters:
        stats_code: code of the statistics ('species', 'family', 'phylum'
            or 'namespace').
    '''

    # initialize the control variable
    OK = True

    # assign the text of the "name" corresponding to the statistics code
    if stats_code == 'species':
        name = 'Species - Frequency distribution'
    elif stats_code == 'family':
        name = 'Family - Frequency distribution'
    elif stats_code == 'phylum':
        name = 'Phylum - Frequency distribution'
    elif stats_code == 'namespace':
        name = 'GO - Frequency distribution per namespace'

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'Statistics - {name} data')

    # get the pipeline dataset identification
    app_list = [xlib.get_toa_process_pipeline_nucleotide_code(), xlib.get_toa_process_pipeline_aminoacid_code(), xlib.get_toa_process_merge_annotations_code()]
    pipeline_dataset_id = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print('WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build the distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/{stats_code}-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file, mode='rt', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file, mode='r', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # read the file inside a context manager so the handle is always
        # closed (the original code leaked it)
        with stats_file_id:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # the first record is a header: skip it
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "stats_code_id";"all_count";"first_hsp_count";"min_evalue_count"
                    fields = record.split(';')
                    fields[-1] = fields[-1].strip('\n')
                    data_list = [field.strip('"') for field in fields]
                    try:
                        key_id = data_list[0]
                        all_count = data_list[1]
                        first_hsp_count = data_list[2]
                        min_evalue_count = data_list[3]
                    except IndexError:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006', os.path.basename(stats_file), record_counter)

                    # add data to the dictionary
                    distribution_dict[key_id] = {'id': key_id, 'all_count': all_count, 'first_hsp_count': first_hsp_count, 'min_evalue_count': min_evalue_count}

                # read the next record
                record = stats_file_id.readline()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if not distribution_dict:
            print('*** WARNING: There is not any distribution.')
        else:

            # set data width
            id_width = 50
            all_count_width = 11
            first_hsp_count_width = 11
            min_evalue_count_width = 11

            # set the line template
            line_template = '{0:' + str(id_width) + '} {1:' + str(all_count_width) + '} {2:' + str(first_hsp_count_width) + '} {3:' + str(min_evalue_count_width) + '}'

            # print header
            print(line_template.format(stats_code.capitalize(), 'All', 'First HSP', 'Min e-value'))
            print(line_template.format('=' * id_width, '=' * all_count_width, '=' * first_hsp_count_width, '=' * min_evalue_count_width))

            # print detail lines
            for key in sorted(distribution_dict.keys()):
                print(line_template.format(distribution_dict[key]['id'], distribution_dict[key]['all_count'], distribution_dict[key]['first_hsp_count'], distribution_dict[key]['min_evalue_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def form_view_dataset_data_frecuency():
    '''
    View the frequency distribution of annotation dataset data.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Statistics - Annotation datasets - Frequency distribution data')

    # get the pipeline dataset identification
    app_list = [xlib.get_toa_process_pipeline_nucleotide_code(), xlib.get_toa_process_pipeline_aminoacid_code()]
    pipeline_dataset_id = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print('WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build the distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/dataset-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file, mode='rt', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file, mode='r', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # read the file inside a context manager so the handle is always
        # closed (the original code leaked it)
        with stats_file_id:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # the first record is a header: skip it
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "dataset_name";"annotated_seq_count";"remained_seq_count"
                    fields = record.split(';')
                    fields[-1] = fields[-1].strip('\n')
                    data_list = [field.strip('"') for field in fields]
                    try:
                        dataset_name = data_list[0]
                        annotated_seq_count = data_list[1]
                        remained_seq_count = data_list[2]
                    except IndexError:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006', os.path.basename(stats_file), record_counter)

                    # add data to the dictionary, keyed by record number to
                    # preserve the file order
                    distribution_dict[record_counter] = {'dataset_name': dataset_name, 'annotated_seq_count': annotated_seq_count, 'remained_seq_count': remained_seq_count}

                # read the next record
                record = stats_file_id.readline()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if not distribution_dict:
            print('*** WARNING: There is not any distribution.')
        else:

            # set data width
            dataset_name_width = 19
            annotated_seq_count_width = 14
            remained_seq_count_width = 14

            # set the line template
            line_template = '{0:' + str(dataset_name_width) + '} {1:' + str(annotated_seq_count_width) + '} {2:' + str(remained_seq_count_width) + '}'

            # print header
            print(line_template.format('Dataset', 'Annotated seqs', 'Remained seqs'))
            print(line_template.format('=' * dataset_name_width, '=' * annotated_seq_count_width, '=' * remained_seq_count_width))

            # print detail lines
            for key in sorted(distribution_dict.keys()):
                print(line_template.format(distribution_dict[key]['dataset_name'], distribution_dict[key]['annotated_seq_count'], distribution_dict[key]['remained_seq_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def form_view_x_per_y_data(stats_code):
    '''
    View the x per y data.

    Parameters:
        stats_code: code of the statistics ('hit_per_hsp', 'seq_per_go',
            'seq_per_ec', 'seq_per_interpro', 'seq_per_kegg',
            'seq_per_mapman' or 'seq_per_metacyc').
    '''

    # initialize the control variable
    OK = True

    # assign the text of the "name" and the header labels corresponding to
    # the statistics code
    name_dict = {
        'hit_per_hsp': '# HITs per # HSPs',
        'seq_per_go': '# sequences per # GO terms',
        'seq_per_ec': '# sequences per # EC ids',
        'seq_per_interpro': '# sequences per # InterPro ids',
        'seq_per_kegg': '# sequences per # KEGG ids',
        'seq_per_mapman': '# sequences per # MapMan ids',
        'seq_per_metacyc': '# sequences per # MetaCyc ids'
    }
    x_label_dict = {
        'hit_per_hsp': '# HSPs',
        'seq_per_go': '# GO terms',
        'seq_per_ec': '# EC ids',
        'seq_per_interpro': '# InterPro ids',
        'seq_per_kegg': '# KEGG ids',
        'seq_per_mapman': '# MapMan ids',
        'seq_per_metacyc': '# MetaCyc ids'
    }
    name = name_dict[stats_code]

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'Statistics - {name} data')

    # get the pipeline dataset identification ('hit_per_hsp' statistics do
    # not exist for merged annotation datasets)
    if stats_code == 'hit_per_hsp':
        app_list = [xlib.get_toa_process_pipeline_nucleotide_code(), xlib.get_toa_process_pipeline_aminoacid_code()]
    else:
        app_list = [xlib.get_toa_process_pipeline_nucleotide_code(), xlib.get_toa_process_pipeline_aminoacid_code(), xlib.get_toa_process_merge_annotations_code()]
    pipeline_dataset_id = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print('WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build the distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/{stats_code}-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file, mode='rt', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file, mode='r', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # read the file inside a context manager so the handle is always
        # closed (the original code leaked it)
        with stats_file_id:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # the first record is a header: skip it
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "x_count";"y_count"
                    fields = record.split(';')
                    fields[-1] = fields[-1].strip('\n')
                    data_list = [field.strip('"') for field in fields]
                    try:
                        x_count = data_list[0]
                        y_count = data_list[1]
                    except IndexError:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006', os.path.basename(stats_file), record_counter)

                    # add data to the dictionary, keyed by record number to
                    # preserve the file order
                    distribution_dict[record_counter] = {'x_count': x_count, 'y_count': y_count}

                # read the next record
                record = stats_file_id.readline()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if not distribution_dict:
            print('*** WARNING: There is not any stats data.')
        else:

            # set data width
            x_count_width = 15
            y_count_width = 15

            # set the line template
            line_template = '{0:' + str(x_count_width) + '} {1:' + str(y_count_width) + '}'

            # print header
            y_label = '# HITs' if stats_code == 'hit_per_hsp' else '# sequences'
            print(line_template.format(x_label_dict[stats_code], y_label))
            print(line_template.format('=' * x_count_width, '=' * y_count_width))

            # print detail lines
            for key in sorted(distribution_dict.keys()):
                print(line_template.format(distribution_dict[key]['x_count'], distribution_dict[key]['y_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def form_recreate_annotation_merger_config_file():
    '''
    Recreate the annotation merger config file.
    '''

    # control variable of the dialogue flow
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'{xlib.get_toa_process_merge_annotations_name()} - Recreate config file')

    # applications whose result datasets can take part in a merge; the same
    # list applies to both pipeline selections
    app_list = [xlib.get_toa_process_pipeline_nucleotide_code(), xlib.get_toa_process_pipeline_aminoacid_code(), xlib.get_toa_process_merge_annotations_code()]

    # ask for the identification of the first pipeline dataset
    print('First pipeline ...')
    pipeline_dataset_id_1 = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id_1 == '':
        print('WARNING: There are not any pipeline datasets.')
        OK = False

    # ask for the identification of the second pipeline dataset, which has
    # to be different from the first one
    print('Second pipeline ...')
    pipeline_dataset_id_2 = cinputs.input_result_dataset_id(xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id_2 == '':
        print('WARNING: There are not any pipeline datasets.')
        OK = False
    elif pipeline_dataset_id_1 == pipeline_dataset_id_2:
        print('ERROR: The first pipeline dataset is equal to the second one.')
        OK = False

    # ask for the merger operation
    if OK:
        merger_operation = cinputs.input_code(text='Merger operation', code_list=xlib.get_annotation_merger_operation_code_list(), default_code=None).upper()

    # ask the user to confirm the recreation of the config file
    if OK:
        print(xlib.get_separator())
        OK = clib.confirm_action(f'The file {xtoa.get_annotation_merger_config_file()} is going to be recreated. The previous files will be lost.')

    # recreate the config file and report the outcome
    if OK:
        (OK, error_list) = xtoa.create_annotation_merger_config_file(pipeline_dataset_id_1, pipeline_dataset_id_2, merger_operation)
        if OK:
            print('The file is recreated.')
        else:
            for error in error_list:
                print(error)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')