示例#1
0
def form_restart_pipeline_process(pipeline_type):
    '''
    Restart a pipeline process from the last step ended OK.

    Arguments:
        pipeline_type: the pipeline type code (nucleotide or amino acid
            pipeline, as returned by the corresponding xlib getters).
    '''

    # initialize the control variable
    OK = True

    # set the pipeline name; fail fast on an unknown pipeline type instead of
    # hitting a NameError when "name" is used below
    if pipeline_type == xlib.get_toa_process_pipeline_nucleotide_code():
        name = xlib.get_toa_process_pipeline_nucleotide_name()
    elif pipeline_type == xlib.get_toa_process_pipeline_aminoacid_code():
        name = xlib.get_toa_process_pipeline_aminoacid_name()
    else:
        raise ValueError(f'Invalid pipeline type: {pipeline_type}')

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'{name} - Run process')

    # get the pipeline dataset identification
    app_list = [pipeline_type]
    pipeline_dataset_id = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print(f'WARNING: There are not any {pipeline_type} result datasets.')
        OK = False

    # confirm the process run
    if OK:
        print(xlib.get_separator())
        OK = clib.confirm_action(f'The {name} process is going to be run.')

    # run the process, logging its output to stdout
    if OK:

        devstdout = xlib.DevStdOut(xtoa.restart_pipeline_process.__name__)
        OK = xtoa.restart_pipeline_process(pipeline_type,
                                           pipeline_dataset_id,
                                           devstdout,
                                           function=None)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#2
0
def form_view_phylogenic_data_frecuency(stats_code):
    '''
    View the frequency distribution of phylogenic data.

    Arguments:
        stats_code: code of the statistics to show: 'species', 'family',
            'phylum' or 'namespace'.
    '''

    # initialize the control variable
    OK = True

    # assign the text of the "name"; fail fast on an unsupported code instead
    # of hitting a NameError when "name" is used below
    if stats_code == 'species':
        name = 'Species - Frequency distribution'
    elif stats_code == 'family':
        name = 'Family - Frequency distribution'
    elif stats_code == 'phylum':
        name = 'Phylum - Frequency distribution'
    elif stats_code == 'namespace':
        name = 'GO - Frequency distribution per namespace'
    else:
        raise ValueError(f'Invalid stats code: {stats_code}')

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'Statistics - {name} data')

    # get the pipeline dataset identification
    app_list = [
        xlib.get_toa_process_pipeline_nucleotide_code(),
        xlib.get_toa_process_pipeline_aminoacid_code(),
        xlib.get_toa_process_merge_annotations_code()
    ]
    pipeline_dataset_id = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print(
            'WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/{stats_code}-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file,
                                          mode='rt',
                                          encoding='iso-8859-1',
                                          newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file,
                                     mode='r',
                                     encoding='iso-8859-1',
                                     newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # make sure the file handle is released even when a malformed record
        # raises (the original code leaked the handle)
        try:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # skip the header record
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "stats_code_id";"all_count";"first_hsp_count";"min_evalue_count"
                    data_list = []
                    begin = 0
                    for end in [i for i, ch in enumerate(record) if ch == ';']:
                        data_list.append(record[begin:end].strip('"'))
                        begin = end + 1
                    data_list.append(record[begin:].strip('\n').strip('"'))
                    try:
                        stats_id = data_list[0]
                        all_count = data_list[1]
                        first_hsp_count = data_list[2]
                        min_evalue_count = data_list[3]
                    except Exception:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006',
                                                    os.path.basename(stats_file),
                                                    record_counter)

                    # add data to the dictionary
                    distribution_dict[stats_id] = {
                        'id': stats_id,
                        'all_count': all_count,
                        'first_hsp_count': first_hsp_count,
                        'min_evalue_count': min_evalue_count
                    }

                # read the next record
                record = stats_file_id.readline()

        finally:
            stats_file_id.close()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if distribution_dict == {}:
            print('*** WARNING: There is not any distribution.')
        else:
            # set data width
            id_width = 50
            all_count_width = 11
            first_hsp_count_width = 11
            min_evalue_count_width = 11
            # set line template
            line_template = '{0:' + str(id_width) + '}   {1:' + str(
                all_count_width) + '}   {2:' + str(
                    first_hsp_count_width) + '}   {3:' + str(
                        min_evalue_count_width) + '}'
            # print header
            print(
                line_template.format(stats_code.capitalize(), 'All',
                                     'First HSP', 'Min e-value'))
            print(
                line_template.format('=' * id_width, '=' * all_count_width,
                                     '=' * first_hsp_count_width,
                                     '=' * min_evalue_count_width))
            # print detail lines, sorted by identification
            for key in sorted(distribution_dict.keys()):
                print(
                    line_template.format(
                        distribution_dict[key]['id'],
                        distribution_dict[key]['all_count'],
                        distribution_dict[key]['first_hsp_count'],
                        distribution_dict[key]['min_evalue_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#3
0
def form_view_dataset_data_frecuency():
    '''
    View the frequency distribution of annotation dataset data.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(
        'Statistics - Annotation datasets - Frequency distribution data')

    # get the pipeline dataset identification
    app_list = [
        xlib.get_toa_process_pipeline_nucleotide_code(),
        xlib.get_toa_process_pipeline_aminoacid_code()
    ]
    pipeline_dataset_id = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print(
            'WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/dataset-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file,
                                          mode='rt',
                                          encoding='iso-8859-1',
                                          newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file,
                                     mode='r',
                                     encoding='iso-8859-1',
                                     newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # make sure the file handle is released even when a malformed record
        # raises (the original code leaked the handle)
        try:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # skip the header record
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "dataset_name";"annotated_seq_count";"remained_seq_count"
                    data_list = []
                    begin = 0
                    for end in [i for i, ch in enumerate(record) if ch == ';']:
                        data_list.append(record[begin:end].strip('"'))
                        begin = end + 1
                    data_list.append(record[begin:].strip('\n').strip('"'))
                    try:
                        dataset_name = data_list[0]
                        annotated_seq_count = data_list[1]
                        remained_seq_count = data_list[2]
                    except Exception:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006',
                                                    os.path.basename(stats_file),
                                                    record_counter)

                    # add data to the dictionary, keyed by record number to
                    # preserve the file order
                    distribution_dict[record_counter] = {
                        'dataset_name': dataset_name,
                        'annotated_seq_count': annotated_seq_count,
                        'remained_seq_count': remained_seq_count
                    }

                # read the next record
                record = stats_file_id.readline()

        finally:
            stats_file_id.close()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if distribution_dict == {}:
            print('*** WARNING: There is not any distribution.')
        else:
            # set data width
            dataset_name_width = 19
            annotated_seq_count_width = 14
            remained_seq_count_width = 14
            # set line template
            line_template = '{0:' + str(dataset_name_width) + '}   {1:' + str(
                annotated_seq_count_width) + '}   {2:' + str(
                    remained_seq_count_width) + '}'
            # print header
            print(
                line_template.format('Dataset', 'Annotated seqs',
                                     'Remained seqs'))
            print(
                line_template.format('=' * dataset_name_width,
                                     '=' * annotated_seq_count_width,
                                     '=' * remained_seq_count_width))
            # print detail lines in file order
            for key in sorted(distribution_dict.keys()):
                print(
                    line_template.format(
                        distribution_dict[key]['dataset_name'],
                        distribution_dict[key]['annotated_seq_count'],
                        distribution_dict[key]['remained_seq_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#4
0
def form_view_x_per_y_data(stats_code):
    '''
    View the x per y data.

    Arguments:
        stats_code: code of the statistics to show: 'hit_per_hsp',
            'seq_per_go', 'seq_per_ec', 'seq_per_interpro', 'seq_per_kegg',
            'seq_per_mapman' or 'seq_per_metacyc'.
    '''

    # initialize the control variable
    OK = True

    # assign the text of the "name"; fail fast on an unsupported code instead
    # of hitting a NameError when "name" is used below
    if stats_code == 'hit_per_hsp':
        name = '# HITs per # HSPs'
    elif stats_code == 'seq_per_go':
        name = '# sequences per # GO terms'
    elif stats_code == 'seq_per_ec':
        name = '# sequences per # EC ids'
    elif stats_code == 'seq_per_interpro':
        name = '# sequences per # InterPro ids'
    elif stats_code == 'seq_per_kegg':
        name = '# sequences per # KEGG ids'
    elif stats_code == 'seq_per_mapman':
        name = '# sequences per # MapMan ids'
    elif stats_code == 'seq_per_metacyc':
        name = '# sequences per # MetaCyc ids'
    else:
        raise ValueError(f'Invalid stats code: {stats_code}')

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(f'Statistics - {name} data')

    # get the pipeline dataset identification
    # ('hit_per_hsp' data are not produced by merged-annotation datasets)
    if stats_code == 'hit_per_hsp':
        app_list = [
            xlib.get_toa_process_pipeline_nucleotide_code(),
            xlib.get_toa_process_pipeline_aminoacid_code()
        ]
    else:
        app_list = [
            xlib.get_toa_process_pipeline_nucleotide_code(),
            xlib.get_toa_process_pipeline_aminoacid_code(),
            xlib.get_toa_process_merge_annotations_code()
        ]
    pipeline_dataset_id = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), app_list)
    if pipeline_dataset_id == '':
        print(
            'WARNING: There are not any annotation pipeline result datasets.')
        OK = False

    # build distribution dictionary
    if OK:

        # initialize the distribution dictionary
        distribution_dict = {}

        # get the dictionary of TOA configuration
        toa_config_dict = xtoa.get_toa_config_dict()

        # get the statistics file path
        stats_file = f'{toa_config_dict["RESULT_DIR"]}/{xlib.get_toa_result_pipeline_dir()}/{pipeline_dataset_id}/{toa_config_dict["STATS_SUBDIR_NAME"]}/{stats_code}-{toa_config_dict["STATS_BASE_NAME"]}.csv'

        # open the statistics file (transparently handling gzip compression)
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file,
                                          mode='rt',
                                          encoding='iso-8859-1',
                                          newline='\n')
            except Exception:
                raise xlib.ProgramException('F002', stats_file)
        else:
            try:
                stats_file_id = open(stats_file,
                                     mode='r',
                                     encoding='iso-8859-1',
                                     newline='\n')
            except Exception:
                raise xlib.ProgramException('F001', stats_file)

        # make sure the file handle is released even when a malformed record
        # raises (the original code leaked the handle)
        try:

            # initialize the record counter
            record_counter = 0

            # initialize the header record control
            header_record = True

            # read the first record
            record = stats_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to the record counter
                record_counter += 1

                # skip the header record
                if header_record:
                    header_record = False

                # process data records
                else:

                    # extract data
                    # record format: "x_count";"y_count"
                    data_list = []
                    begin = 0
                    for end in [i for i, ch in enumerate(record) if ch == ';']:
                        data_list.append(record[begin:end].strip('"'))
                        begin = end + 1
                    data_list.append(record[begin:].strip('\n').strip('"'))
                    try:
                        x_count = data_list[0]
                        y_count = data_list[1]
                    except Exception:
                        # fewer fields than expected -> malformed record
                        raise xlib.ProgramException('F006',
                                                    os.path.basename(stats_file),
                                                    record_counter)

                    # add data to the dictionary, keyed by record number to
                    # preserve the file order
                    distribution_dict[record_counter] = {
                        'x_count': x_count,
                        'y_count': y_count
                    }

                # read the next record
                record = stats_file_id.readline()

        finally:
            stats_file_id.close()

    # print the distribution
    if OK:
        print(xlib.get_separator())
        if distribution_dict == {}:
            print('*** WARNING: There is not any stats data.')
        else:
            # set data width
            x_count_width = 15
            y_count_width = 15
            # set line template
            line_template = '{0:' + str(x_count_width) + '}   {1:' + str(
                y_count_width) + '}'
            # print header
            if stats_code == 'hit_per_hsp':
                print(line_template.format('# HSPs', '# HITs'))
            elif stats_code == 'seq_per_go':
                print(line_template.format('# GO terms', '# sequences'))
            elif stats_code == 'seq_per_ec':
                print(line_template.format('# EC ids', '# sequences'))
            elif stats_code == 'seq_per_interpro':
                print(line_template.format('# InterPro ids', '# sequences'))
            elif stats_code == 'seq_per_kegg':
                print(line_template.format('# KEGG ids', '# sequences'))
            elif stats_code == 'seq_per_mapman':
                print(line_template.format('# MapMan ids', '# sequences'))
            elif stats_code == 'seq_per_metacyc':
                print(line_template.format('# MetaCyc ids', '# sequences'))
            print(
                line_template.format('=' * x_count_width, '=' * y_count_width))
            # print detail lines in file order
            for key in sorted(distribution_dict.keys()):
                print(
                    line_template.format(distribution_dict[key]['x_count'],
                                         distribution_dict[key]['y_count']))

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#5
0
def form_recreate_annotation_merger_config_file():
    '''
    Recreate the annotation merger config file.
    '''

    # control variable: set to False as soon as any step fails
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(
        f'{xlib.get_toa_process_merge_annotations_name()} - Recreate config file'
    )

    # the same application list is used for both pipeline dataset prompts
    dataset_app_list = [
        xlib.get_toa_process_pipeline_nucleotide_code(),
        xlib.get_toa_process_pipeline_aminoacid_code(),
        xlib.get_toa_process_merge_annotations_code()
    ]

    # ask for the identification of the first pipeline dataset
    print('First pipeline ...')
    pipeline_dataset_id_1 = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), dataset_app_list)
    if pipeline_dataset_id_1 == '':
        print('WARNING: There are not any pipeline datasets.')
        OK = False

    # ask for the identification of the second pipeline dataset, which must
    # differ from the first one
    print('Second pipeline ...')
    pipeline_dataset_id_2 = cinputs.input_result_dataset_id(
        xlib.get_toa_result_pipeline_dir(), dataset_app_list)
    if pipeline_dataset_id_2 == '':
        print('WARNING: There are not any pipeline datasets.')
        OK = False
    elif pipeline_dataset_id_1 == pipeline_dataset_id_2:
        print('ERROR: The first pipeline dataset is equal to the second one.')
        OK = False

    # ask for the merger operation (normalized to upper case)
    if OK:
        merger_operation = cinputs.input_code(
            text='Merger operation',
            code_list=xlib.get_annotation_merger_operation_code_list(),
            default_code=None).upper()

    # recreate the pipeline config file
    if OK:

        # ask for confirmation before overwriting the previous file
        print(xlib.get_separator())
        OK = clib.confirm_action(
            f'The file {xtoa.get_annotation_merger_config_file()} is going to be recreated. The previous files will be lost.'
        )

        # write the new config file and report the outcome
        if OK:
            (OK, error_list) = xtoa.create_annotation_merger_config_file(
                pipeline_dataset_id_1, pipeline_dataset_id_2, merger_operation)
            if OK:
                print('The file is recreated.')
            else:
                for error in error_list:
                    print(error)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')