示例#1
0
def check_args(args):
    '''
    Validate the input arguments and fill in the defaults of the optional ones.

    Each problem found is reported with xlib.Message.print('error', ...) and
    the validation goes on, so all the problems are reported in one run. When
    at least one check failed, xlib.ProgramException is raised with the code
    'P001' once every check is done.

    Side effects: "verbose" and "trace" are replaced by their defaults when
    they are None, and the verbose/trace status of xlib.Message is switched
    on when the corresponding value is 'Y' (upper or lower case).
    '''

    # number of checks that have failed so far
    error_count = 0

    # "ngshelper_database" is mandatory
    if args.ngshelper_database is None:
        xlib.Message.print('error', '*** The NGShelper database is not indicated in the input arguments.')
        error_count += 1

    # "alignment_file" is mandatory and has to be an existing file
    alignment_file = args.alignment_file
    if alignment_file is None:
        xlib.Message.print('error', '*** The alignment file is not indicated in the input arguments.')
        error_count += 1
    elif not os.path.isfile(alignment_file):
        xlib.Message.print('error', f'*** The file {alignment_file} does not exist.')
        error_count += 1

    # "verbose" is optional; a given value has to be one of the valid codes
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    else:
        verbose_is_valid = xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False)
        if not verbose_is_valid:
            xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
            error_count += 1
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" is optional; a given value has to be one of the valid codes
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    else:
        trace_is_valid = xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False)
        if not trace_is_valid:
            xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
            error_count += 1
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # a single exception summarizes all the errors reported above
    if error_count > 0:
        raise xlib.ProgramException('', 'P001')
示例#2
0
def check_infrastructure_software():
    '''
    Check if the infrastructure software is setup.

    The check runs "blastx -h" and looks at its return code. When blastx
    cannot be run, or it ends with a return code other than 0, an error
    message is printed and xlib.ProgramException is raised with the code
    'I001'.
    '''

    # initialize the control variable
    OK = True

    # check blastx; Windows-like platforms use the ".exe" name and every other
    # platform (not only Linux and macOS) uses the plain name, so the check
    # never fails because of an unexpected value of sys.platform
    if sys.platform.startswith('win32') or sys.platform.startswith('cygwin'):
        program = 'blastx.exe'
    else:
        program = 'blastx'

    # the output is discarded with subprocess.DEVNULL instead of a shell
    # redirection: it works the same way on every platform and it does not
    # create a file named "null" on Windows, as "1>null" does
    try:
        rc = subprocess.call([program, '-h'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # the program is not in the PATH or it cannot be executed
        rc = -1
    if rc != 0:
        OK = False
        xlib.Message.print('error', 'blastx is not found.')

    # if there is software not found, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'I001')
示例#3
0
文件: ccloud.py 项目: GGFHF/NGScloud
def form_update_region_zone():
    '''
    Update the current region and zone names in the NGScloud config file
    corresponding to the environment.

    The new names are asked to the user, the update is confirmed with
    clib.confirm_action() and, if confirmed, it is done by
    xconfiguration.update_region_zone_data(). When that call reports a
    failure, the errors are printed and xlib.ProgramException is raised
    with 'C001'.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Configuration - Update region and zone')

    # input new current region and zone
    # NOTE(review): "region_name" and "zone_name" are read here before any
    # assignment in this function, so these calls raise UnboundLocalError as
    # written; presumably the current values have to be loaded first -- TODO
    # confirm where they come from.
    print(xlib.get_separator())
    region_name = cinputs.input_region_name(region_name, help=True)
    zone_name = cinputs.input_zone_name(region_name, zone_name, help=True)
  
    # get the NGScloud config file
    ngscloud_config_file = xconfiguration.get_ngscloud_config_file()
  
    # confirm the region and zone update in the NGScloud config file
    print(xlib.get_separator())
    OK = clib.confirm_action('The file {0} is going to be update with the new region and zone.'.format(ngscloud_config_file))

    # save the new region and zone in the NGScloud config file
    if OK:
        print(xlib.get_separator())
        print('The file {0} is being update with the new region and zone ...'.format(ngscloud_config_file))
        (OK, error_list) = xconfiguration.update_region_zone_data(region_name, zone_name)
        if OK:
            print('The config file has been update.')
        else:
            # show every error reported by the update before giving up
            for error in error_list:
                print(error)
            raise xlib.ProgramException('C001')

    # show continuation message 
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#4
0
def get_genomic_features_dict(conn, transcript_seq_id, transcript_start,
                              transcript_end):
    '''
    Get a sequence feature dictionary from the table "genomic_features"
    corresponding to a sequence identification, with start less than or equal
    to the transcript start and end greater than or equal to the transcript end.

    The dictionary keys are consecutive integers starting at 0 (the order in
    which the rows are returned) and every value is a dictionary with the keys
    "seq_id", "start", "end", "type" and "gene".

    If the query cannot be executed, xlib.ProgramException is raised with the
    code 'B002'.

    NOTE(review): the values are bound with "?" placeholders, so "conn" has to
    accept the sqlite3 call style execute(sql, parameters) -- the original
    call conn.execute(sql) suggests an sqlite3 connection, confirm.
    '''

    # select rows from the table "genomic_features"; the values are bound as
    # parameters instead of being written into the sentence, so a sequence
    # identification that contains quotation marks, or that is equal to a
    # column name, is always compared as a text value
    sentence = '''
                SELECT seq_id, start, end, type, gene
                    FROM genomic_features
                    WHERE seq_id = ?
                      AND start <= ?
                      AND end >= ?;
                '''
    parameters = (transcript_seq_id, transcript_start, transcript_end)
    try:
        rows = conn.execute(sentence, parameters)
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # build and return the sequence feature dictionary, one item per row
    return {
        key: {
            'seq_id': row[0],
            'start': row[1],
            'end': row[2],
            'type': row[3],
            'gene': row[4],
        }
        for key, row in enumerate(rows)
    }
示例#5
0
def calculate_haplotype_statistics(loci_file_path, stats_file_path):
    '''
    Calculates haplotype statistics per locus.

    The loci file is read record by record. The records that do not start
    with "//" are locus information records ("taxon sequence") and they are
    kept until the next record starting with "//", which closes the locus and
    holds its identification. For every locus, the number of distinct
    sequences found in its information records is saved.

    The statistics file is written with a header and one line per locus
    (haplotype number and locus identification), sorted by haplotype number.

    xlib.ProgramException is raised with the code 'F001' if the loci file
    cannot be opened or the statistics file cannot be written.
    '''

    # open the loci file
    try:
        loci_file_id = open(loci_file_path, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', loci_file_path)

    # set the pattern of the locus id records
    locus_id_pattern = re.compile(r'^\/\/(.*)\|(.*)\|$')

    # set the pattern of the locus information records
    locus_info_pattern = re.compile(r'^(.*) (.*)$')

    # initialize the list of locus information records
    locus_line_list = []

    # initialize the dictionary of haplotype sequence number by locus
    haplotype_number_by_locus_dict = {}

    # the "with" block closes the loci file even if a record cannot be processed
    with loci_file_id:

        # for every record of the loci file
        for record in loci_file_id:

            # process the locus id record
            if record.startswith('//'):

                # extract the locus id
                mo = locus_id_pattern.search(record)
                variant_seq = mo.group(1)
                locus_id = mo.group(2)

                # get the distinct sequences of the locus; "sequence" stays
                # None when the locus has no information records, so a value
                # from a previous locus is never reused
                haplotype_seqs_in_locus = set()
                sequence = None
                for locus_line in locus_line_list:
                    sequence = locus_info_pattern.search(locus_line).group(2).strip()
                    haplotype_seqs_in_locus.add(sequence)

                # calculate the variant sequence: only the last characters,
                # as many as the length of the last sequence of the locus
                if sequence is not None:
                    variant_seq = variant_seq[-len(sequence):]
                xlib.Message.print(
                    'trace',
                    f'locus_id: {locus_id:8} - variant_seq: >{variant_seq}<\n')

                # add the haplotype sequence number to the dictionary of haplotype sequence number by locus
                haplotype_number_by_locus_dict[locus_id] = len(haplotype_seqs_in_locus)

                # initialize the list of locus information records
                locus_line_list = []

            # process a locus information record
            else:

                # add the record to the list of locus information records
                locus_line_list.append(record)

    # get a list of (locus identification, haplotype number) sorted by haplotype number
    haplotype_seqs_in_locus_list = sorted(
        haplotype_number_by_locus_dict.items(), key=operator.itemgetter(1))

    # write the statistics file
    try:
        print()
        with open(stats_file_path, mode='w',
                  encoding='iso-8859-1') as stats_file_id:
            stats_file_id.write('"haplotype number","locus identification"\n')
            for (locus_id, haplotype_number) in haplotype_seqs_in_locus_list:
                stats_file_id.write(
                    f'{haplotype_number},"locus_{locus_id}"\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', stats_file_path)
示例#6
0
def check_args(args):
    '''
    Verify the input arguments data.

    Each problem found is reported with xlib.Message.print('error', ...) and
    the verification goes on, so all the problems are reported in one run.
    When at least one check failed, xlib.ProgramException is raised with the
    code 'P001' at the end.

    Side effects: the optional arguments left as None get their default
    values; "minlen" and "maxlen" are converted to int and "minFPKM" and
    "minTPM" to float; the directory of the output file is created if it does
    not exist; the verbose/trace status of xlib.Message is switched on when
    the corresponding argument is 'Y' (upper or lower case).
    '''

    # initialize the control variable
    OK = True

    # check the assembly_software_code value
    if args.assembly_software_code is None:
        xlib.Message.print(
            'error',
            '*** The assembly software that generated the transcritpme file is not indicated in the input arguments.'
        )
        OK = False
    elif args.assembly_software_code not in [
            xlib.Const.AS_TRINITY_CODE, xlib.Const.AS_SOAPDENOVOTRANS_CODE,
            xlib.Const.AS_GENERATED_BY_NGSCLOUD
    ]:
        xlib.Message.print(
            'error',
            f'*** {args.assembly_software_code} is not a valid code of assembly software.'
        )
        OK = False

    # check the transcriptome_file value
    if args.transcriptome_file is None:
        xlib.Message.print(
            'error',
            '*** A transcritpme file in Fasta format is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.transcriptome_file):
        xlib.Message.print(
            'error', f'*** The file {args.transcriptome_file} does not exist.')
        OK = False

    # check the score_file value
    if args.score_file is None:
        xlib.Message.print(
            'error',
            '*** A score file where RSEM-EVAL (DETONATE package) saved the score of the transcriptome file is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.score_file):
        xlib.Message.print('error',
                           f'*** The file {args.score_file} does not exist.')
        OK = False

    # check the output_file value
    if args.output_file is None:
        xlib.Message.print(
            'error',
            '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.'
        )
        OK = False
    else:
        try:
            # a file name without directory part means the current directory:
            # there is nothing to create and os.makedirs('') would fail
            output_dir = os.path.dirname(args.output_file)
            if output_dir != '' and not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except Exception:
            xlib.Message.print(
                'error',
                f'*** The directory {os.path.dirname(args.output_file)} of the file {args.output_file} is not valid.'
            )
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check the minFPKM value (reported like the other errors of this function)
    if args.minFPKM is None:
        args.minFPKM = xlib.Const.DEFAULT_MINFPKM
    elif not xlib.check_float(args.minFPKM, minimum=0.0):
        xlib.Message.print(
            'error',
            '*** FPKM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minFPKM = float(args.minFPKM)

    # check the minTPM value (the message names TPM, not FPKM)
    if args.minTPM is None:
        args.minTPM = xlib.Const.DEFAULT_MINTPM
    elif not xlib.check_float(args.minTPM, minimum=0.0):
        xlib.Message.print(
            'error',
            '*** TPM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minTPM = float(args.minTPM)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater or equal than minlen value; this is
    # done only when there are no errors, because only then both values are
    # sure to be already converted to int
    if OK:
        if args.maxlen < args.minlen:
            xlib.Message.print(
                'error',
                '*** The maxlen value has to be greater than or equal to minlen.'
            )
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
示例#7
0
def load_table_ec_ids(conn, ec_id_file):
    '''
    Rebuild the table "ec_ids" with the data of an EC id file.

    The table is dropped and created again, one row is inserted per
    identification block of the file, the index of the table is created and
    the changes are committed with conn.commit().

    The file may be gzip-compressed (name ending in ".gz"). The records
    before the first one starting with "ID" are skipped. A block is a record
    starting with "ID" (its text from the fourth character on is "ec_id")
    followed by the records starting with "DE", whose texts are joined with
    ", " into "desc"; any other record is only counted.

    xlib.ProgramException is raised with the code 'F002' (gzip file) or
    'F001' (plain file) if the file cannot be opened.
    '''

    # drop table "ec_ids" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "ec_ids" ...\n')
    xsqlite.drop_ec_ids(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "ec_ids"
    xlib.Message.print('verbose', 'Creating the table "ec_ids" ...\n')
    xsqlite.create_ec_ids(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the EC id file
    if ec_id_file.endswith('.gz'):
        try:
            ec_id_file_id = gzip.open(ec_id_file, mode='rt', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', ec_id_file)
    else:
        try:
            ec_id_file_id = open(ec_id_file, mode='r', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', ec_id_file)

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # the "with" block closes the EC id file even if a row insertion fails
    with ec_id_file_id:

        # read the first record
        record = ec_id_file_id.readline()

        # while there are records and they are the header
        while record != '' and not record.startswith('ID'):

            # add 1 to record counter
            record_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}')

            # read the next record
            record = ec_id_file_id.readline()

        # if there is a first definition block
        if record.startswith('ID'):

            # while there are records
            while record != '':

                # when the record is an identification
                if record.startswith('ID'):

                    # add 1 to record counter
                    record_counter += 1

                    # initialize the row dictionary
                    row_dict = {}
                    row_dict['ec_id'] = record[3:].strip()
                    row_dict['desc'] = ''

                    # print record counter
                    xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}')

                    # read the next record
                    record = ec_id_file_id.readline()

                # while there are records and the record is a definition
                while record != '' and record.startswith('DE'):

                    # add 1 to record counter
                    record_counter += 1

                    # concat the description
                    if row_dict['desc'] == '':
                        row_dict['desc'] = record[3:].strip()
                    else:
                        row_dict['desc'] = f'''{row_dict['desc']}, {record[3:].strip()}'''

                    # change quotation marks and semicolons in "desc"
                    row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(';', ',')

                    # print record counter
                    xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}')

                    # read the next record
                    record = ec_id_file_id.readline()

                # insert data into table "ec_ids"; the last character of
                # "desc" is dropped (presumably the final period of the
                # description -- TODO confirm)
                row_dict['desc'] = row_dict['desc'][:-1]
                xsqlite.insert_ec_ids_row(conn, row_dict)
                inserted_row_counter += 1

                # while there are records and the record is not an identification and is not a definition
                while record != '' and not record.startswith('ID') and not record.startswith('DE'):

                    # add 1 to record counter
                    record_counter += 1

                    # print record counter
                    xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}')

                    # read the next record
                    record = ec_id_file_id.readline()

        xlib.Message.print('verbose', '\n')

    # create the index on the table "ec_ids"
    xlib.Message.print('verbose', 'Creating the index on the table "ec_ids" ...\n')
    xsqlite.create_ec_ids_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
示例#8
0
def load_table_datasets(conn, dataset_file):
    '''
    Rebuild the table "datasets" with the data of a dataset file.

    The table is dropped and created again, one row is inserted per data
    record of the file, the index of the table is created and the changes are
    committed with conn.commit().

    The file may be gzip-compressed (name ending in ".gz"). The blank records
    and the records starting with "#" are only counted. A data record has four
    fields in double quotation marks separated by semicolons.

    xlib.ProgramException is raised with the code 'F002' (gzip file) or
    'F001' (plain file) if the file cannot be opened, and with the code
    'F006' if a data record does not have the expected format.
    '''

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # set the pattern of the data records: four fields in double quotation
    # marks separated by semicolons
    # NOTE(review): this format was documented as
    # "repository_id";"dataset_id";"dataset_name";"ftp_adress", but the fields
    # are read below as dataset_id, dataset_name, repository_id, ftp_adress --
    # confirm which order the dataset file really has.
    record_pattern = re.compile(r'^"(.*)";"(.*)";"(.*)";"(.*)"$')

    # drop table "datasets"
    xlib.Message.print('verbose', 'Droping the table "datasets" ...\n')
    xsqlite.drop_datasets(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "datasets"
    xlib.Message.print('verbose', 'Creating the table "datasets" ...\n')
    xsqlite.create_datasets(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of datasets
    if dataset_file.endswith('.gz'):
        try:
            dataset_file_id = gzip.open(dataset_file, mode='rt', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', dataset_file)
    else:
        try:
            dataset_file_id = open(dataset_file, mode='r', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', dataset_file)

    # the "with" block closes the dataset file even if a record is wrong or a
    # row insertion fails
    with dataset_file_id:

        # for every record of the dataset file
        for record in dataset_file_id:

            # add 1 to record counter
            record_counter += 1

            # process data records
            if not record.lstrip().startswith('#') and record.strip() != '':

                # initialize the row data dictionary
                row_dict = {}

                # extract data ("mo" is None when the record does not match
                # the pattern, and that is reported as a format error too)
                try:
                    mo = record_pattern.match(record)
                    row_dict['dataset_id'] = mo.group(1).strip().lower()
                    row_dict['dataset_name'] = mo.group(2).strip()
                    row_dict['repository_id'] = mo.group(3).strip().lower()
                    row_dict['ftp_adress'] = mo.group(4).strip()
                except Exception:
                    raise xlib.ProgramException('F006', os.path.basename(dataset_file), record_counter)

                # review null values of "ftp_adress"
                if row_dict['ftp_adress'] == '':
                    row_dict['ftp_adress'] = xlib.get_na()

                # insert data into table "datasets"
                xsqlite.insert_datasets_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of dataset file: {record_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # create the index on the table "datasets"
    xlib.Message.print('verbose', 'Creating the index on the table "datasets" ...\n')
    xsqlite.create_datasets_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
示例#9
0
def save_annotation_file_merger_format(annotation_file_1, type_1, merger_file, header):
    '''
    Save a annotation file with record format "PLAZA", "REFSEQ", "NT" or "NR" in record format "MERGER".

    Every record read from the annotation file with
    xlib.read_annotation_record() is written in the merger file with
    xlib.write_merged_annotation_record(). When "header" is 'Y', a header
    record is written first (and it is counted as a written record). Both
    files may be gzip-compressed (name ending in ".gz").

    xlib.ProgramException is raised with the code 'F002'/'F001' if the
    annotation file cannot be opened (gzip/plain) and with the code
    'F004'/'F003' if the merger file cannot be opened (gzip/plain).
    '''

    # open the annotation file
    if annotation_file_1.endswith('.gz'):
        try:
            annotation_file_1_id = gzip.open(annotation_file_1, mode='rt', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', annotation_file_1)
    else:
        try:
            annotation_file_1_id = open(annotation_file_1, mode='r', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', annotation_file_1)

    # the files already opened are closed in the "finally" clause whatever
    # happens, even if the merger file cannot be opened or a record is wrong
    merger_file_id = None
    try:

        # open the merger file
        if merger_file.endswith('.gz'):
            try:
                merger_file_id = gzip.open(merger_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F004', merger_file)
        else:
            try:
                merger_file_id = open(merger_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F003', merger_file)

        # initialize record counters
        read_record_counter_1 = 0
        written_record_counter = 0

        # print header record in merged file if necessary
        if header == 'Y':
            xlib.write_annotation_header(merger_file_id, 'MERGER')
            written_record_counter += 1

        # read the first record of the annotation file
        (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
        xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # while there are records in annotation file
        while record_1 != '':

            # add 1 to record counter
            read_record_counter_1 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the annotation file
            (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
            xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # print summary
        xlib.Message.print('verbose', '\n')
        xlib.Message.print('info', f'{read_record_counter_1} read records in the annotation file.')
        xlib.Message.print('info', f'{written_record_counter} written records in the merged annotation file.')

    finally:

        # close files
        annotation_file_1_id.close()
        if merger_file_id is not None:
            merger_file_id.close()
示例#10
0
def merge_files_operation_1and2(annotation_file_1, type_1, annotation_file_2, type_2, merger_file, header):
    '''
    Merge annotation files with operation "1AND2" (annotations included in both files).

    The records of both files are read with xlib.read_annotation_record() and
    written in the merger file with xlib.write_merged_annotation_record(),
    advancing through the two files by comparing their keys: the record with
    the lower key is written first and, when both keys are equal, the record
    of the first file is written before the record of the second file. When
    "header" is 'Y', a header record is written first (and it is counted as a
    written record). The three files may be gzip-compressed (name ending in
    ".gz").

    NOTE(review): the key comparison presumably relies on both annotation
    files being sorted by key -- confirm. Every record of both files is
    written, not only the ones whose key is in both files -- confirm this is
    what "1AND2" means.

    xlib.ProgramException is raised with the code 'F002'/'F001' if an
    annotation file cannot be opened (gzip/plain) and with the code
    'F004'/'F003' if the merger file cannot be opened (gzip/plain).
    '''

    # open the first annotation file
    if annotation_file_1.endswith('.gz'):
        try:
            annotation_file_1_id = gzip.open(annotation_file_1, mode='rt', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', annotation_file_1)
    else:
        try:
            annotation_file_1_id = open(annotation_file_1, mode='r', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', annotation_file_1)

    # the files already opened are closed in the "finally" clause whatever
    # happens, even if another file cannot be opened or a record is wrong
    annotation_file_2_id = None
    merger_file_id = None
    try:

        # open the second annotation file
        if annotation_file_2.endswith('.gz'):
            try:
                annotation_file_2_id = gzip.open(annotation_file_2, mode='rt', encoding='iso-8859-1')
            except Exception:
                raise xlib.ProgramException('F002', annotation_file_2)
        else:
            try:
                annotation_file_2_id = open(annotation_file_2, mode='r', encoding='iso-8859-1')
            except Exception:
                raise xlib.ProgramException('F001', annotation_file_2)

        # open the merged annotation file
        if merger_file.endswith('.gz'):
            try:
                merger_file_id = gzip.open(merger_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F004', merger_file)
        else:
            try:
                merger_file_id = open(merger_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception:
                raise xlib.ProgramException('F003', merger_file)

        # initialize record counters
        read_record_counter_1 = 0
        read_record_counter_2 = 0
        written_record_counter = 0

        # print header record in merged file if necessary
        if header == 'Y':
            xlib.write_annotation_header(merger_file_id, 'MERGER')
            written_record_counter += 1

        # read the first record of the first annotation file
        (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
        xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # read the first record of the second annotation file
        (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
        xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

        # while there are records in any annotation file
        while record_1 != '' or record_2 != '':

            # while there are records in the first annotation file and key of the first annotation file is less then the key of the second annotation file
            # (or the second annotation file is finished)
            while record_1 != '' and (record_2 != '' and key_1 < key_2 or record_2 == ''):

                # add 1 to record counter
                read_record_counter_1 += 1

                # write in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the first annotation file
                (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
                xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

            # while there are records in both annotation files and key of the first annotation file is equal to the key of the second annotation file
            while record_1 != '' and record_2 != '' and key_1 == key_2:

                # add 1 to record counter of the first annotation file
                read_record_counter_1 += 1

                # write the first file record in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the first annotation file
                (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
                xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

                # add 1 to record counter of the second annotation file: its
                # record is processed here too, so it has to be counted as in
                # the other loops
                read_record_counter_2 += 1

                # write the second file record in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the second annotation file
                (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
                xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

            # while there are records in the second annotation file and key of the first annotation file is greater than the key of the second annotation file
            # (or the first annotation file is finished)
            while record_2 != '' and (record_1 != '' and key_1 > key_2 or record_1 == ''):

                # add 1 to record counter
                read_record_counter_2 += 1

                # write in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the second annotation file
                (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
                xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

        # print summary
        xlib.Message.print('verbose', '\n')
        xlib.Message.print('info', f'{read_record_counter_1} records read from the first annotation file.')
        xlib.Message.print('info', f'{read_record_counter_2} records read from the second annotation file.')
        xlib.Message.print('info', f'{written_record_counter} records written in the merged annotation file.')

    finally:

        # close files
        annotation_file_1_id.close()
        if annotation_file_2_id is not None:
            annotation_file_2_id.close()
        if merger_file_id is not None:
            merger_file_id.close()
示例#11
0
def _iter_logged(parent, tag, level, newline='\n'):
    '''
    Yield every element found by "parent.iter(tag=tag)", printing a verbose
    message with its tag, attributes and text just before it is yielded.

    level: nesting level of the tag; it only sets the length of the arrow
        that prefixes the message ('->', '--->', '----->', ...).
    newline: text appended at the end of the message.
    '''

    # the arrow has 1, 3, 5, ... dashes for the levels 1, 2, 3, ...
    arrow = '-' * (2 * level - 1) + '>'

    for item in parent.iter(tag=tag):
        xlib.Message.print('verbose', f'{arrow} tag: {item.tag} - attrib: {item.attrib} - text: {item.text}{newline}')
        yield item


def load_table_blast_5(conn, dataset_id, blast_file):
    '''
    Load the data of a BLAST XML file into the table "blast".

    One row is inserted for each iteration-hit-hsp found in the file. The
    table and its index are created if they do not exist, and the previous
    rows of the dataset are deleted before loading. Changes are committed
    at the end.

    conn: connection passed to the "xsqlite" functions; it is committed here.
    dataset_id: dataset identification saved in every inserted row.
    blast_file: path of the BLAST XML file.

    Nothing is done (and nothing is returned) when the BLAST file is empty.
    xlib.ProgramException (code F001) is raised when the file can not be opened.
    '''

    # check if BLAST file is not empty; the file is closed as soon as its first record is read
    try:
        blast_file_id = open(blast_file, mode='r', encoding='iso-8859-1')
    except Exception:
        raise xlib.ProgramException('F001', blast_file)
    with blast_file_id:
        record = blast_file_id.readline()
    if record == '':
        return

    # initialize the iteration counter
    iteration_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # create table "blast"
    xlib.Message.print('verbose', 'Creating the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "blast"
    xlib.Message.print('verbose', 'Creating the index on the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete rows from table "blast" corresponding to the dataset identification
    xlib.Message.print('verbose', 'Deleting previous rows from the table "blast" ...\n')
    xsqlite.delete_blast_rows(conn, dataset_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # build the complete item tree from BLAST XML file
    tree = xml.etree.ElementTree.parse(blast_file)
    root = tree.getroot()

    # walk the tree and insert data into table "blast" for each iteration-hit-hsp
    for item_blastoutput_iterations in _iter_logged(root, 'BlastOutput_iterations', 1):

        # get items "Iteration"
        for item_iteration in _iter_logged(item_blastoutput_iterations, 'Iteration', 2):

            # initialize the row data dictionary; the iteration data get a default value
            # so that the keys exist even if their tags are not found
            row_dict = {}
            row_dict['dataset_id'] = dataset_id
            row_dict['iteration_iter_num'] = 0
            row_dict['iteration_query_def'] = ''

            # add 1 to iteration counter
            iteration_counter += 1

            # get data of item "Iteration_iter-num"
            for item in _iter_logged(item_iteration, 'Iteration_iter-num', 3):
                row_dict['iteration_iter_num'] = int(item.text)

            # get data of item "Iteration_query-def"
            for item in _iter_logged(item_iteration, 'Iteration_query-def', 3):
                row_dict['iteration_query_def'] = item.text

            # get items "Iteration_hits"
            for item_iteration_hits in _iter_logged(item_iteration, 'Iteration_hits', 3):

                # get items "Hit" (their message has no end of line)
                for item_hit in _iter_logged(item_iteration_hits, 'Hit', 4, newline=''):

                    # initialize hit data
                    row_dict['hit_num'] = 0
                    row_dict['hit_id'] = xlib.get_na()
                    row_dict['hit_def'] = xlib.get_na()
                    row_dict['hit_accession'] = xlib.get_na()

                    # get data of item "Hit_num"
                    for item in _iter_logged(item_hit, 'Hit_num', 5):
                        row_dict['hit_num'] = int(item.text)

                    # get data of item "Hit_id"
                    for item in _iter_logged(item_hit, 'Hit_id', 5):
                        row_dict['hit_id'] = item.text

                    # get data of item "Hit_def"; quotation marks and semicolons are replaced
                    # unless the item has no text (then None is kept)
                    for item in _iter_logged(item_hit, 'Hit_def', 5):
                        hit_def = item.text
                        if hit_def is not None:
                            hit_def = hit_def.replace("'", '|').replace(';', ',')
                        row_dict['hit_def'] = hit_def

                    # get data of item "Hit_accession"
                    for item in _iter_logged(item_hit, 'Hit_accession', 5):
                        row_dict['hit_accession'] = item.text

                    # get items "Hit_hsps"
                    for item_hit_hsps in _iter_logged(item_hit, 'Hit_hsps', 5):

                        # get items "Hsp" of the current item "Hit_hsps"
                        for item_hsp in _iter_logged(item_hit_hsps, 'Hsp', 6):

                            # initialize hsp data
                            row_dict['hsp_num'] = 0
                            row_dict['hsp_evalue'] = 0.
                            row_dict['hsp_identity'] = 0
                            row_dict['hsp_positive'] = 0
                            row_dict['hsp_gaps'] = 0
                            row_dict['hsp_align_len'] = 0
                            row_dict['hsp_qseq'] = ''

                            # get data of item "Hsp_num"
                            for item in _iter_logged(item_hsp, 'Hsp_num', 7):
                                row_dict['hsp_num'] = int(item.text)

                            # get data of item "Hsp_evalue"
                            for item in _iter_logged(item_hsp, 'Hsp_evalue', 7):
                                row_dict['hsp_evalue'] = float(item.text)

                            # get data of item "Hsp_identity"
                            for item in _iter_logged(item_hsp, 'Hsp_identity', 7):
                                row_dict['hsp_identity'] = int(item.text)

                            # get data of item "Hsp_positive"
                            for item in _iter_logged(item_hsp, 'Hsp_positive', 7):
                                row_dict['hsp_positive'] = int(item.text)

                            # get data of item "Hsp_gaps"
                            for item in _iter_logged(item_hsp, 'Hsp_gaps', 7):
                                row_dict['hsp_gaps'] = int(item.text)

                            # get data of item "Hsp_align-len"
                            for item in _iter_logged(item_hsp, 'Hsp_align-len', 7):
                                row_dict['hsp_align_len'] = int(item.text)

                            # get data of item "Hsp_qseq"
                            for item in _iter_logged(item_hsp, 'Hsp_qseq', 7):
                                row_dict['hsp_qseq'] = item.text

                            # insert data into table "blast"
                            xsqlite.insert_blast_row(conn, row_dict)
                            inserted_row_counter += 1

            # print iteration counter
            xlib.Message.print('verbose', f'\rIterations: {iteration_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
示例#12
0
def check_args(args):
    '''
    Validate the input arguments and normalize some of them in place.

    Every problem found is printed as an error message; when at least one
    problem is found, xlib.ProgramException (code P001) is raised at the end.
    "dataset_id" is changed to lowercase, "verbose" and "trace" get their
    default value when they are not indicated, and the verbose and trace
    statuses are switched on when their values are "Y" (in any case).
    '''

    # it is changed to True by the first error reported
    errors_found = False

    def report(text):
        '''
        Print an error message and remember that an error was found.
        '''
        nonlocal errors_found
        xlib.Message.print('error', text)
        errors_found = True

    # "toa_database" has to be indicated
    if args.toa_database is None:
        report('*** The TOA database is not indicated in the input arguments.')

    # "dataset_id" has to be indicated; it is saved in lowercase
    if args.dataset_id is not None:
        args.dataset_id = args.dataset_id.lower()
    else:
        report('*** The dataset identification is not indicated in the input arguments.')

    # "blast_file_format" has to be indicated and be one of the valid codes
    if args.blast_file_format is None:
        report('*** The BLAST file format is not indicated in the input arguments.')
    elif not xlib.check_code(args.blast_file_format, xlib.get_blast_file_format_code_list(), case_sensitive=False):
        report(f'*** The BLAST file format has to be {xlib.get_blast_file_format_code_list_text()}.')

    # "blast_file" has to be indicated and be an existing file
    if args.blast_file is None:
        report('*** The BLAST file is not indicated in the input arguments.')
    elif not os.path.isfile(args.blast_file):
        report(f'*** The file {args.blast_file} does not exist.')

    # "verbose" is optional, but it has to be a valid code when it is indicated
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    else:
        verbose_is_valid = xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False)
        if not verbose_is_valid:
            report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" is optional, but it has to be a valid code when it is indicated
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    else:
        trace_is_valid = xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False)
        if not trace_is_valid:
            report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # exit with exception when any error was reported
    if errors_found:
        raise xlib.ProgramException('P001')
示例#13
0
def extract_annotations(annotation_file, type, id_file, extract_file,
                        stats_file):
    '''
    Extract the annotations whose identification is in an identification file.

    The records of the annotation file are read one by one; those whose
    identification is in the identification list are written in the extracted
    file and counted in the identification dictionary. Finally, the
    statistics are written by "write_stats".

    annotation_file: path of the annotation file (gzip format if it ends with ".gz").
    type: annotation file type, passed to the "xlib" read/write functions.
    id_file: path of the identification file, read by "get_id_data".
    extract_file: path of the output file (gzip format if it ends with ".gz").
    stats_file: path of the statistics file, passed to "write_stats".

    xlib.ProgramException is raised with the codes F001/F002 (the annotation
    file can not be opened) or F003/F004 (the extracted file can not be created).
    '''

    # get the identification data
    (id_list, id_dict) = get_id_data(id_file)

    # build a set only once so that the membership test of every record is fast
    id_set = set(id_list)

    # open the annotation file
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F002', annotation_file)
    else:
        try:
            annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException('F001', annotation_file)

    # open the extracted identification file; the annotation file is closed if it fails
    if extract_file.endswith('.gz'):
        try:
            extract_file_id = gzip.open(extract_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception:
            annotation_file_id.close()
            raise xlib.ProgramException('F004', extract_file)
    else:
        try:
            extract_file_id = open(extract_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception:
            annotation_file_id.close()
            raise xlib.ProgramException('F003', extract_file)

    try:

        # initialize record counters
        read_record_counter = 0
        written_record_counter = 0

        # write header record in the extracted identification file
        xlib.write_annotation_header(extract_file_id, type)
        written_record_counter += 1

        # read the first record of the annotation file (header)
        read_record_counter += 1
        (record, key, data_dict) = xlib.read_annotation_record(annotation_file, annotation_file_id, type, read_record_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

        # while there are records
        while record != '':

            # get the identification of the current record
            annotation_id = key

            # this sentence block is only used in a particular case;
            # the key is kept whole when the separator is not found
            # ("find" returns -1 and the slice would drop the last character)
            if key.startswith('CUFF'):
                first_dot_position = key.find('.')
                second_dot_position = key.find('.', first_dot_position + 1)
                if first_dot_position != -1 and second_dot_position != -1:
                    annotation_id = key[:second_dot_position]
            elif key.startswith('scaffold'):
                space_position = key.find(' ')
                if space_position != -1:
                    annotation_id = key[:space_position]

            # if the identification is in the identification list
            if annotation_id in id_set:

                # add 1 to the annotation counter of the identification
                id_dict[annotation_id] += 1

                # write in the extracted identification file
                xlib.write_merged_annotation_record(extract_file_id, type, data_dict)
                written_record_counter += 1

            xlib.Message.print('verbose', f'\rRead annotations: {read_record_counter} - Written annotations: {written_record_counter}')

            # read the next record of the annotation file
            read_record_counter += 1
            (record, key, data_dict) = xlib.read_annotation_record(annotation_file, annotation_file_id, type, read_record_counter)
            xlib.Message.print('trace', f'key: {key} - record: {record}')

        xlib.Message.print('verbose', '\n')

        # print summary (the last read did not get any record)
        xlib.Message.print('info', f'{read_record_counter - 1} annotations read in annotation file.')
        xlib.Message.print('info', f'{written_record_counter} annotations written in the extracted identification file.')

    finally:

        # close files, even if the process has failed
        annotation_file_id.close()
        extract_file_id.close()

    # write stats
    write_stats(stats_file, id_list, id_dict)
示例#14
0
def check_args(args):
    '''
    Validate the input arguments and normalize some of them in place.

    Every problem found is printed as an error message; when at least one
    problem is found, xlib.ProgramException (code P001) is raised at the end.
    "type" is changed to uppercase when it is valid, "verbose" and "trace"
    get their default value when they are not indicated, and the verbose and
    trace statuses are switched on when their values are "Y" (in any case).
    '''

    # it is changed to True by the first error reported
    errors_found = False

    def report(text):
        '''
        Print an error message and remember that an error was found.
        '''
        nonlocal errors_found
        xlib.Message.print('error', text)
        errors_found = True

    # "annotation_file" has to be indicated and be an existing file
    if args.annotation_file is None:
        report('*** The annotation file is not indicated in the input arguments.')
    elif not os.path.isfile(args.annotation_file):
        report(f'*** The file {args.annotation_file} does not exist.')

    # "type" has to be indicated and be one of the valid codes; it is saved in uppercase
    if args.type is None:
        report('*** The type of annotation file is not indicated in the input arguments.')
    elif xlib.check_code(args.type, xlib.get_type_code_list(), case_sensitive=False):
        args.type = args.type.upper()
    else:
        report(f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')

    # "id_file" has to be indicated and be an existing file
    if args.id_file is None:
        report('*** The identification file is not indicated in the input arguments.')
    elif not os.path.isfile(args.id_file):
        report(f'*** The file {args.id_file} does not exist.')

    # "extract_file" has to be indicated
    if args.extract_file is None:
        report('*** The extracted annotation file is not indicated in the input arguments.')

    # "stats_file" has to be indicated
    if args.stats_file is None:
        report('*** The statistics file is not indicated in the input arguments.')

    # "verbose" is optional, but it has to be a valid code when it is indicated
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    else:
        verbose_is_valid = xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False)
        if not verbose_is_valid:
            report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" is optional, but it has to be a valid code when it is indicated
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    else:
        trace_is_valid = xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False)
        if not trace_is_valid:
            report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # exit with exception when any error was reported
    if errors_found:
        raise xlib.ProgramException('P001')
示例#15
0
文件: ccloud.py 项目: GGFHF/NGScloud
def form_set_environment():
    '''
    Set the environment.

    The environment name is asked to the user until a recorded one is typed
    or a new one is accepted and recorded. If the NGScloud config file of the
    environment is not created yet, it is created together with the key pairs
    directory and the config files of the applications, the transfers and the
    gzip processes. Finally, the environment variables are set.

    xlib.ProgramException (code C002) is raised when a new environment can
    not be recorded. The errors of a config file creation are printed, but
    they do not stop the process.
    '''

    # print headers
    clib.clear_screen()
    clib.print_headers_without_environment('Set environment')

    # initialize the environment
    xconfiguration.environment = ''

    # get the current environments list
    environments_list = xconfiguration.get_environments_list()

    # print the available environment names
    if environments_list != []:
        print('Current environments list: {0} ...'.format(str(environments_list).strip('[]').replace('\'', '')))
        input_text = '... Enter the environment name: '
    else:
        print('Currently there is not any environment recorded.')
        input_text = 'Enter a new environment name: '

    # input and validate the environment
    while xconfiguration.environment == '':
        xconfiguration.environment = input(input_text)

        # an empty name can never end this loop, so it is asked again instead of offering to record it
        if xconfiguration.environment == '':
            continue

        if xconfiguration.environment not in environments_list:
            print(xlib.get_separator())
            answer = input('{0} is not a recorded environment. Do you like to record it? (Y/N): '.format(xconfiguration.environment))
            if answer not in ['Y', 'y']:
                xconfiguration.environment = ''
            else:
                (OK, error_list) = xconfiguration.add_environment(xconfiguration.environment)
                if not OK:
                    for error in error_list:
                        print(error)
                    raise xlib.ProgramException('C002')

    # check if it is necessary to create the NGScloud config file corresponding to the environment
    if not xconfiguration.is_ngscloud_config_file_created():

        print(xlib.get_separator())
        print('Creating the config files ...')

        # create the NGScloud config file
        form_create_ngscloud_config_file(is_menu_call=False)

        # create the key pairs directory
        if not os.path.exists(xlib.get_keypairs_dir()):
            os.makedirs(xlib.get_keypairs_dir())

        # functions that create the remaining config files (in creation order) and their arguments
        config_file_creators = [
            # applications
            (xbusco.create_busco_config_file, {}),
            (xcdhit.create_cd_hit_est_config_file, {}),
            (xfastqc.create_fastqc_config_file, {}),
            (xgmap.create_gmap_config_file, {}),
            (xtrinity.create_insilico_read_normalization_config_file, {}),
            (xquast.create_quast_config_file, {}),
            (xdetonate.create_ref_eval_config_file, {}),
            (xrnaquast.create_rnaquast_config_file, {}),
            (xdetonate.create_rsem_eval_config_file, {}),
            (xsoapdenovotrans.create_soapdenovotrans_config_file, {}),
            (xstar.create_star_config_file, {}),
            (xtransabyss.create_transabyss_config_file, {}),
            (xngshelper.create_transcript_filter_config_file, {}),
            (xngshelper.create_transcriptome_blastx_config_file, {}),
            (xtransrate.create_transrate_config_file, {}),
            (xtrimmomatic.create_trimmomatic_config_file, {}),
            (xtrinity.create_trinity_config_file, {}),
            # transfers
            (xreference.create_reference_transfer_config_file, {}),
            (xdatabase.create_database_transfer_config_file, {}),
            (xread.create_read_transfer_config_file, {}),
            (xresult.create_result_transfer_config_file, {'status': 'uncompressed'}),
            # gzip processes
            (xgzip.create_gzip_config_file, {'dataset_type': 'reference'}),
            (xgzip.create_gzip_config_file, {'dataset_type': 'database'}),
            (xgzip.create_gzip_config_file, {'dataset_type': 'read'}),
            (xgzip.create_gzip_config_file, {'dataset_type': 'result'}),
        ]

        # create every config file; a failure is shown to the user, but the following files are still created
        for (create_config_file, arguments) in config_file_creators:
            (OK, error_list) = create_config_file(**arguments)
            if not OK:
                for error in error_list:
                    print(error)

    # set the environment variables corresponding to the NGScloud config file, the AWS access key identification,
    # AWS secret access key and the current region name
    print(xlib.get_separator())
    print('Setting the environment variables ...')
    xconfiguration.set_environment_variables()
    print('The environment variables are set.')

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
示例#16
0
def check_args(args):
    '''
    Validate the input arguments and normalize some of them in place.

    Every problem found is printed as an error message; when at least one
    problem is found, xlib.ProgramException (code P001) is raised at the end.
    The optional arguments that are not indicated get their default value,
    "allele_transformation" is changed to uppercase when it is valid,
    "tvi_list" is changed to a list, and the verbose and trace statuses are
    switched on when their values are "Y" (in any case).
    '''

    # it is changed to True by the first error reported
    errors_found = False

    def report(text):
        '''
        Print an error message and remember that an error was found.
        '''
        nonlocal errors_found
        xlib.Message.print('error', text)
        errors_found = True

    # "ngshelper_database" has to be indicated
    if args.ngshelper_database is None:
        report('*** The NGShelper database is not indicated in the input arguments.')

    # "vcf_file" has to be indicated and be an existing file
    if args.vcf_file is None:
        report('*** The VCF file is not indicated in the input arguments.')
    elif not os.path.isfile(args.vcf_file):
        report(f'*** The file {args.vcf_file} does not exist.')

    # "sample_file" has to be indicated and be an existing file
    if args.sample_file is None:
        report('*** The sample file is not indicated in the input arguments.')
    elif not os.path.isfile(args.sample_file):
        report(f'*** The file {args.sample_file} does not exist.')

    # "sp1_id" has to be indicated
    if args.sp1_id is None:
        report('*** The identification of the first species is not indicated in the input arguments.')

    # "sp2_id" has to be indicated
    if args.sp2_id is None:
        report('*** The identification of the second species is not indicated in the input arguments.')

    # "hybrid_id", "imputed_md_id" and "new_md_id" are optional and get a default value
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'
    if args.imputed_md_id is None:
        args.imputed_md_id = xlib.Const.DEFAULT_IMPUTED_MD_ID
    if args.new_md_id is None:
        args.new_md_id = xlib.Const.DEFAULT_NEW_MD_ID

    # "allele_transformation" is optional, but it has to be a valid code when it is indicated;
    # a valid code is saved in uppercase
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif xlib.check_code(args.allele_transformation, xlib.get_allele_transformation_code_list(), case_sensitive=False):
        args.allele_transformation = args.allele_transformation.upper()
    else:
        report(f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.')

    # "verbose" is optional, but it has to be a valid code when it is indicated
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    else:
        verbose_is_valid = xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False)
        if not verbose_is_valid:
            report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" is optional, but it has to be a valid code when it is indicated
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    else:
        trace_is_valid = xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False)
        if not trace_is_valid:
            report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # "tvi_list" is an empty list when it is not indicated or it is the literal "NONE"
    if args.tvi_list is not None and args.tvi_list != 'NONE':
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)
    else:
        args.tvi_list = []

    # the identifications are only compared when no previous error was found
    if not errors_found:
        same_species = args.sp1_id == args.sp2_id
        # NOTE: "hybrid_id" is never None here because it was defaulted to 'NONE' above
        hybrid_repeated = args.hybrid_id is not None and (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id)
        if same_species or hybrid_repeated:
            report('The identifications must be different.')

    # exit with exception when any error was reported
    if errors_found:
        raise xlib.ProgramException('', 'P001')
示例#17
0
def get_ko_annotations(transcripts_with_ko_file, annotation_dict):
    '''
    Add the KO data of a transcripts with KO file to the annotation dictionary.

    The records starting with "#" are skipped. The other records have to be
    tab-separated with the format: counter, transcript_id, ko, description.
    The KO identification and description of each record are saved in the
    keys "ko_id" and "ko_desc" of the transcript entry; if the entry already
    has values in these keys, the new ones are appended using "*" as
    separator. The keys "go_id", "go_desc" and "gf_id" of the entry are kept
    (or set to an empty string if they do not exist).

    transcripts_with_ko_file: path of the file (gzip format if it ends with ".gz").
    annotation_dict: dictionary keyed by transcript identification; it is updated in place.

    The updated annotation dictionary is returned. xlib.ProgramException is
    raised with the codes F001/F002 (the file can not be opened) or F006
    (a data record has less than four fields).
    '''

    # initialize the record counter
    record_counter = 0

    # open the transcripts with KO file
    if transcripts_with_ko_file.endswith('.gz'):
        try:
            transcripts_with_ko_file_id = gzip.open(transcripts_with_ko_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', transcripts_with_ko_file)
    else:
        try:
            transcripts_with_ko_file_id = open(transcripts_with_ko_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', transcripts_with_ko_file)

    # the file is closed when the "with" block ends, even if a record is wrong
    with transcripts_with_ko_file_id:

        # for each record (the counter includes the comment records)
        for record_counter, record in enumerate(transcripts_with_ko_file_id, start=1):

            # skip comment records
            if record.startswith('#'):
                continue

            # extract data
            # record format: counter	transcript_id	ko	description
            data_list = [field.strip() for field in record.split('\t')]
            try:
                transcript_id = data_list[1]
                ko = data_list[2]
                description = data_list[3]
            except IndexError as e:
                raise xlib.ProgramException(e, 'F006', os.path.basename(transcripts_with_ko_file), record_counter)

            # change quotation marks in "description"
            description = description.replace("'", '|')

            # get the data previously saved for the transcript (if any)
            previous_data = annotation_dict.get(transcript_id, {})

            # append the KO data to the previous ones
            ko_id_w = previous_data.get('ko_id', '')
            ko_id_w = ko if ko_id_w == '' else f'{ko_id_w}*{ko}'
            ko_desc_w = previous_data.get('ko_desc', '')
            ko_desc_w = description if ko_desc_w == '' else f'{ko_desc_w}*{description}'

            # insert data into annotation dictionary
            annotation_dict[transcript_id] = {
                'go_id': previous_data.get('go_id', ''),
                'go_desc': previous_data.get('go_desc', ''),
                'gf_id': previous_data.get('gf_id', ''),
                'ko_id': ko_id_w,
                'ko_desc': ko_desc_w,
            }

            # print counters
            xlib.Message.print('verbose', f'\rProcessed records of transcripts with KO file: {record_counter}')

    xlib.Message.print('verbose', '\n')

    # return the annotation dictionary
    return annotation_dict
示例#18
0
def convert_simhyb_to_structure(simhyb_file, header_row_number,
                                structure_file):
    '''
    Convert a output SimHyb file to the input Structure format in two lines.

    The first "header_row_number" records of the SimHyb file are skipped. Every
    following record is split by tabulators and yields two Structure records:
    both start with the first two fields of the SimHyb record; the first one
    carries the fields placed at even positions from the field index 12 on
    (left genotypes) and the second one the fields placed at odd positions
    (right genotypes).

    Parameters:
        simhyb_file: path of the SimHyb file; it is read with gzip when the
            path ends with ".gz".
        header_row_number: number of leading records that are not converted.
        structure_file: path of the Structure file to be written; it is
            written with gzip when the path ends with ".gz".

    Raises:
        xlib.ProgramException: with code F001 or F002 when the SimHyb file can
            not be opened, F003 or F004 when the Structure file can not be
            opened, L011 when the number of fields from index 12 on is odd in
            the first data record and L012 when a later record has a different
            number of those fields.
    '''

    # open the SimHyb file
    if simhyb_file.endswith('.gz'):
        try:
            simhyb_file_id = gzip.open(simhyb_file,
                                       mode='rt',
                                       encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', simhyb_file)
    else:
        try:
            simhyb_file_id = open(simhyb_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', simhyb_file)

    # open the Structure file (the SimHyb file is released if this fails)
    if structure_file.endswith('.gz'):
        try:
            structure_file_id = gzip.open(structure_file,
                                          mode='wt',
                                          encoding='iso-8859-1',
                                          newline='\n')
        except Exception as e:
            simhyb_file_id.close()
            raise xlib.ProgramException(e, 'F004', structure_file)
    else:
        try:
            structure_file_id = open(structure_file,
                                     mode='w',
                                     encoding='iso-8859-1',
                                     newline='\n')
        except Exception as e:
            simhyb_file_id.close()
            raise xlib.ProgramException(e, 'F003', structure_file)

    # both files are closed even if a record is rejected or an I/O error happens
    try:

        # the loci number is unknown until the first data record is processed
        loci_number = None

        # initialize the written record counter
        written_record_counter = 0

        # for each record of the SimHyb file
        for input_record_counter, record in enumerate(simhyb_file_id, start=1):

            # when the record has data
            if input_record_counter > header_row_number:

                # extract data; an empty last field (e.g. a trailing
                # tabulator or a blank record) is not considered
                data_list = [field.strip() for field in record.split('\t')]
                if data_list[-1] == '':
                    data_list.pop()

                # check the loci number: the fields from index 12 on have to
                # be paired and their number has to be the same in all records
                current_loci_number = len(data_list) - 12
                if loci_number is None:
                    loci_number = current_loci_number
                    if (loci_number % 2) == 1:
                        raise xlib.ProgramException('', 'L011')
                elif loci_number != current_loci_number:
                    raise xlib.ProgramException('', 'L012')

                # get left (even positions) and right (odd positions)
                # genotype lists of loci
                gt_left_list = data_list[12::2]
                gt_right_list = data_list[13::2]

                # write the record corresponding to the left genotype list
                gt_left_list_text = '\t'.join(gt_left_list)
                structure_file_id.write(
                    f'{data_list[0]}\t{data_list[1]}\t{gt_left_list_text}\n')
                written_record_counter += 1

                # write the record corresponding to the right genotype list
                gt_right_list_text = '\t'.join(gt_right_list)
                structure_file_id.write(
                    f'{data_list[0]}\t{data_list[1]}\t{gt_right_list_text}\n')
                written_record_counter += 1

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed SimHyb records ... {input_record_counter:8d} - Written Structure records ... {written_record_counter:8d}'
            )

        xlib.Message.print('verbose', '\n')

    finally:

        # close files
        simhyb_file_id.close()
        structure_file_id.close()

    # print OK message
    xlib.Message.print(
        'info',
        f'The converted file {os.path.basename(structure_file)} is created.')
示例#19
0
def merge_files_operation_1best(annotation_file_1, type_1, annotation_file_2, type_2, merger_file, header):
    '''
    Merge annotation files with operation "1BEST" (all annotations of the first file and
    annotations of the second file if their seq id is not in the first).

    Both files are walked in parallel comparing the item "nt_seq_id" of the
    current record of each file, so the result is only a proper merge when
    both files are ordered by that item (NOTE(review): the ordering is assumed
    here, it is not checked -- confirm against the callers).

    Parameters:
        annotation_file_1: path of the first annotation file (gzip if ".gz").
        type_1: type of the first annotation file, passed to the xlib readers
            and writers.
        annotation_file_2: path of the second annotation file (gzip if ".gz").
        type_2: type of the second annotation file.
        merger_file: path of the merged annotation file (gzip if ".gz").
        header: when it is 'Y', a header record is written in the merged file
            (and counted as a written record).

    Raises:
        xlib.ProgramException: with code F001 or F002 when an annotation file
            can not be opened and F003 or F004 when the merged file can not be
            opened.
    '''

    # the file objects are initialized to None so that the final clean-up only
    # closes the files that were really opened
    annotation_file_1_id = None
    annotation_file_2_id = None
    merger_file_id = None

    try:

        # open the first annotation file
        if annotation_file_1.endswith('.gz'):
            try:
                annotation_file_1_id = gzip.open(annotation_file_1, mode='rt', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException('F002', annotation_file_1) from e
        else:
            try:
                annotation_file_1_id = open(annotation_file_1, mode='r', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException('F001', annotation_file_1) from e

        # open the second annotation file
        if annotation_file_2.endswith('.gz'):
            try:
                annotation_file_2_id = gzip.open(annotation_file_2, mode='rt', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException('F002', annotation_file_2) from e
        else:
            try:
                annotation_file_2_id = open(annotation_file_2, mode='r', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException('F001', annotation_file_2) from e

        # open the merged annotation file
        if merger_file.endswith('.gz'):
            try:
                merger_file_id = gzip.open(merger_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException('F004', merger_file) from e
        else:
            try:
                merger_file_id = open(merger_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException('F003', merger_file) from e

        # initialize record counters
        read_record_counter_1 = 0
        read_record_counter_2 = 0
        written_record_counter = 0

        # print header record in merged file if necessary
        if header == 'Y':
            xlib.write_annotation_header(merger_file_id, 'MERGER')
            written_record_counter += 1

        # read the first record of the first annotation file
        (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
        xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # read the first record of the second annotation file
        (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
        xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

        # while there are records in any annotation file
        # (the first compound of the key, the sequence identification of transcripts nt_seq_id, is only considered in this processing)
        while record_1 != '' or record_2 != '':

            # while there are records in the first annotation file and either the second annotation file is exhausted
            # or the key of the first annotation file is less than the key of the second annotation file
            # (the data dictionaries are only compared when both files still have a record)
            while record_1 != '' and (record_2 == '' or data_dict_1['nt_seq_id'] < data_dict_2['nt_seq_id']):

                # add 1 to record counter
                read_record_counter_1 += 1

                # write in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the first annotation file
                (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
                xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

            # while there are records in the first annotation file and key of the first annotation file is equal to the key of the second annotation file
            while record_1 != '' and record_2 != '' and data_dict_1['nt_seq_id'] == data_dict_2['nt_seq_id']:

                # add 1 to record counter
                read_record_counter_1 += 1

                # write in the merged annotation file (the annotation of the first file is the one kept)
                xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read next records of the second annotation file while their key is equal to the key of the first annotation file
                # NOTE(review): the records discarded here are not added to read_record_counter_2, so the summary below
                # only counts the records of the second file that are written -- confirm this is intended
                while record_2 != '' and data_dict_1['nt_seq_id'] == data_dict_2['nt_seq_id']:
                    (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
                    xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

                # read the next record of the first annotation file
                (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
                xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

            # while there are records in the second annotation file and either the first annotation file is exhausted
            # or the key of the first annotation file is greater than the key of the second annotation file
            while record_2 != '' and (record_1 == '' or data_dict_1['nt_seq_id'] > data_dict_2['nt_seq_id']):

                # add 1 to record counter
                read_record_counter_2 += 1

                # write in the merged annotation file
                xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
                written_record_counter += 1
                xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

                # read the next record of the second annotation file
                (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
                xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

        # print summary
        xlib.Message.print('verbose', '\n')
        xlib.Message.print('info', f'{read_record_counter_1} records read from the first annotation file.')
        xlib.Message.print('info', f'{read_record_counter_2} records read from the second annotation file.')
        xlib.Message.print('info', f'{written_record_counter} records written in the merged annotation file.')

    finally:

        # close the files that were opened, whatever happened above
        for file_id in (annotation_file_1_id, annotation_file_2_id, merger_file_id):
            if file_id is not None:
                file_id.close()
示例#20
0
def check_args(args):
    '''
    Check the input arguments.

    The arguments "simhyb_file", "header_row_number" and "structure_file" are
    mandatory; "verbose" and "trace" get their default value when they are not
    indicated. An error message is printed for every wrong argument.

    Side effects on "args": "header_row_number" is converted to integer when
    it is valid, and "verbose" and "trace" are set to their defaults when they
    are None. The verbose and trace status of xlib.Message are switched on when
    the corresponding argument is 'Y' (case-insensitive).

    Raises:
        xlib.ProgramException: with code P001 when any argument is wrong.
    '''

    # initialize the control variable
    OK = True

    # check "simhyb_file"
    if args.simhyb_file is None:
        xlib.Message.print(
            'error',
            '*** The SimHyb file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.simhyb_file):
        xlib.Message.print('error',
                           f'*** The file {args.simhyb_file} does not exist.')
        OK = False

    # check "header_row_number"
    if args.header_row_number is None:
        xlib.Message.print(
            'error',
            '*** The header row number in the SimHyb file is not indicated in the input arguments.'
        )
        OK = False
    elif not xlib.check_int(args.header_row_number, minimum=0):
        xlib.Message.print(
            'error',
            'The header row number in the SimHyb file has to be an integer number greater than or equal to 0.'
        )
        OK = False
    else:
        args.header_row_number = int(args.header_row_number)

    # check "structure_file" (only its presence: the file is an output)
    if args.structure_file is None:
        xlib.Message.print(
            'error',
            '*** The converted Structure file is not indicated in the input arguments.'
        )
        OK = False

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
示例#21
0
def check_args(args):
    '''
    Check the input arguments.

    An error message is printed for every wrong argument. The values of
    "type_1", "type_2", "merger_operation" and "header" are converted to
    uppercase when they are valid; "annotation_file_2" is converted to
    uppercase when it is the literal NONE (case-insensitive); "header",
    "verbose" and "trace" get their default value when they are not indicated.
    The verbose and trace status of xlib.Message are switched on when the
    corresponding argument is 'Y' (case-insensitive).

    Raises:
        xlib.ProgramException: with code P001 when any argument is wrong.
    '''

    # initialize the control variable
    OK = True

    # check "annotation_file_1"
    if args.annotation_file_1 is None:
        xlib.Message.print('error', '*** The first annotation file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.annotation_file_1):
        xlib.Message.print('error', f'*** The file {args.annotation_file_1} does not exist.')
        OK = False

    # check "type_1"
    if args.type_1 is None:
        xlib.Message.print('error', '*** The type of first annotation file is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_code(args.type_1, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        OK = False
    else:
        args.type_1 = args.type_1.upper()

    # check "annotation_file_2" (the literal NONE means that there is not a second file)
    if args.annotation_file_2 is None:
        xlib.Message.print('error', '*** The second annotation file is not indicated in the input arguments.')
        OK = False
    elif args.annotation_file_2.upper() == 'NONE':
        args.annotation_file_2 = args.annotation_file_2.upper()
    elif not os.path.isfile(args.annotation_file_2):
        xlib.Message.print('error', f'*** The file {args.annotation_file_2} does not exist.')
        OK = False

    # check "type_2"
    if args.type_2 is None:
        xlib.Message.print('error', '*** The format of second annotation file is not indicated in the input arguments.')
        OK = False
    elif args.type_2.upper() == 'NONE' and args.annotation_file_2 != 'NONE':
        # this branch rejects the type NONE used with a real second file, so the
        # message states that rule (the previous text described the converse one)
        xlib.Message.print('error', '*** The format of second annotation file can only be NONE if the second annotation file is NONE')
        OK = False
    elif args.type_2.upper() == 'NONE' and args.annotation_file_2 == 'NONE':
        args.type_2 = args.type_2.upper()
    elif not xlib.check_code(args.type_2, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        OK = False
    else:
        args.type_2 = args.type_2.upper()

    # check "merger_file" (only its presence: the file is an output)
    if args.merger_file is None:
        xlib.Message.print('error', '*** The merged file is not indicated in the input arguments.')
        OK = False

    # check "merger_operation" (SAVE1 is accepted apart from the operation code list)
    if args.merger_operation is None:
        xlib.Message.print('error', '*** The merger operation is not indicated in the input arguments.')
        OK = False
    elif args.merger_operation.upper() == 'SAVE1' and args.annotation_file_2 != 'NONE':
        xlib.Message.print('error', '*** The merger operation SAVE1 is only valid when the second annotation file is NONE.')
        OK = False
    elif args.merger_operation.upper() != 'SAVE1' and not xlib.check_code(args.merger_operation, xlib.get_annotation_merger_operation_code_list(), case_sensitive=False) :
        xlib.Message.print('error', f'*** The merger operation has to be {xlib.get_annotation_merger_operation_code_list_text()}.')
        OK = False
    else:
        args.merger_operation = args.merger_operation.upper()

    # check "header"
    if args.header is None:
        args.header = xlib.Const.DEFAULT_HEADER
    elif not xlib.check_code(args.header, xlib.get_header_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** header has to be {xlib.get_header_code_list_text()}.')
        OK = False
    else:
        args.header = args.header.upper()

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('P001')
示例#22
0
def build_nexus_file(selection_loci_id_file_path, complete_loci_file_path,
                     selected_loci_file_path, nexus_file_path):
    '''
    Build a Nexus file from a ypirad loci file for a determinated loci set.

    The processing has two steps:

    1. The complete loci file is read and the loci whose identification is in
       the selection file are copied to the selected loci file. In the
       complete loci file, the records of a locus (taxon identification, a
       space and the sequence) precede the locus id record, which starts with
       "//" and holds the locus identification between its last two "|".
    2. The selected loci file is read back and a Nexus file is written with a
       data block (one interleaved group of records per locus, where the taxa
       without sequence in a locus are filled with "N") and an assumptions
       block with one charset per locus.

    Parameters:
        selection_loci_id_file_path: path of the file with the selected loci;
            the first six characters of each record are discarded and the
            remainder is the locus identification.
        complete_loci_file_path: path of the loci file with all loci.
        selected_loci_file_path: path of the loci file to be written with the
            selected loci only.
        nexus_file_path: path of the Nexus file to be written.

    Raises:
        xlib.ProgramException: with code F001 when a file can not be opened to
            be read and F003 when a file can not be opened to be written.
    '''

    # initialize the selected loci id list
    selected_loci_id_list = []

    # load the selected loci ids and set the selected loci id list
    try:
        with open(selection_loci_id_file_path) as selected_loci_ids_file_id:
            for record in selected_loci_ids_file_id:
                selected_loci_id_list.append(record[6:].rstrip())
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', selection_loci_id_file_path)
    xlib.Message.print('trace',
                       f'selected_loci_id_list: {selected_loci_id_list}\n')

    # the set is used in the membership tests done for every locus
    selected_loci_id_set = set(selected_loci_id_list)

    # set the pattern of the locus id records
    pattern1 = re.compile(r'^\/\/(.*)\|(.*)\|$')

    # set the pattern of the locus information records
    pattern2 = re.compile(r'^(.*) (.*)$')

    # open the complete loci file
    try:
        complete_loci_file_id = open(complete_loci_file_path,
                                     mode='r',
                                     encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', complete_loci_file_path)

    # open the selected loci file (the complete loci file is released if this fails)
    try:
        selected_loci_file_id = open(selected_loci_file_path,
                                     mode='w',
                                     encoding='iso-8859-1')
    except Exception as e:
        complete_loci_file_id.close()
        raise xlib.ProgramException(e, 'F003', selected_loci_file_path)

    # initialize the list of locus information records
    locus_line_list = []

    # initialize the sequence locus lenght list
    seq_locus_lenght_list = []

    # initialize the taxon id set
    taxon_id_set = set()

    # initialize the base count
    base_count = 0

    # both files are closed when the block ends, even if a record is wrong
    with complete_loci_file_id, selected_loci_file_id:

        # for each record of the complete loci file
        for record in complete_loci_file_id:

            # process the locus id record
            if record.startswith('//'):

                # extract the locus id
                mo = pattern1.search(record)
                locus_id = mo.group(2)

                # when the locus id is a selected locus, write locus information in the selected loci file
                if locus_id in selected_loci_id_set:
                    for i, locus_line in enumerate(locus_line_list):

                        # extract the taxon id and sequence
                        mo = pattern2.search(locus_line)
                        taxon_id = mo.group(1).strip()
                        sequence = mo.group(2).strip()

                        # add the taxon id to taxon id set
                        taxon_id_set.add(taxon_id)

                        # the sequence length of the first taxon is taken as
                        # the length of the locus
                        if i == 0:
                            base_count += len(sequence)
                            seq_locus_lenght_list.append(len(sequence))

                        # write the line to the selected loci file
                        selected_loci_file_id.write(locus_line)

                    # write the locus id record to the selected loci file
                    selected_loci_file_id.write(record)

                # initialize the list of locus information records
                locus_line_list = []

            # process a locus information record
            else:

                # add the record to the list of locus information records
                locus_line_list.append(record)

    # get the sorted taxon id list
    taxon_id_list = sorted(taxon_id_set)
    xlib.Message.print('trace', f'taxon_id_list: {taxon_id_list}\n')

    # open the selected loci file
    try:
        selected_loci_file_id = open(selected_loci_file_path,
                                     mode='r',
                                     encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', selected_loci_file_path)

    # open the Nexus file (the selected loci file is released if this fails)
    try:
        nexus_file_id = open(nexus_file_path, mode='w', encoding='iso-8859-1')
    except Exception as e:
        selected_loci_file_id.close()
        raise xlib.ProgramException(e, 'F003', nexus_file_path)

    # both files are closed when the block ends, even if a record is wrong
    with selected_loci_file_id, nexus_file_id:

        # write the head records in Nexus file
        nexus_file_id.write('#nexus\n')
        nexus_file_id.write('begin data;\n')
        nexus_file_id.write(
            f'  dimensions ntax={len(taxon_id_list)} nchar={base_count};\n')
        nexus_file_id.write('  format datatype=DNA interleave=yes gap=-;\n')
        nexus_file_id.write('  matrix\n')

        # initialize the dictionary of locus information records
        locus_line_dict = {}

        # for each record of the selected loci file
        for record in selected_loci_file_id:

            # process the locus id record
            if record.startswith('//'):

                # a locus without sequence records added nothing to the
                # dimensions nor to the charsets in the first step, so it is
                # not written in the matrix either
                if locus_line_dict:

                    # get the sequence length of the first taxon of the locus
                    sequence_len = len(next(iter(locus_line_dict.values())))

                    # for each taxon, write its information
                    for taxon_id in taxon_id_list:

                        # get the taxon sequence (filled with N when the taxon is not in the locus)
                        sequence = locus_line_dict.get(taxon_id,
                                                       'N' * sequence_len)

                        # write the locus information record in the Nexus file
                        nexus_file_id.write(f'  {taxon_id:30} {sequence}\n')

                    # write a blank line in the Nexus file
                    nexus_file_id.write('\n')

                # initialize the dictionary of locus information records
                locus_line_dict = {}

            # process a locus information record
            else:

                # extract the taxon id and sequence
                mo = pattern2.search(record)
                taxon_id = mo.group(1).strip()
                sequence = mo.group(2).strip()

                # add the record to the dictionary of locus information records
                locus_line_dict[taxon_id] = sequence

        # write the tail records in Nexus file
        nexus_file_id.write('  ;\n')
        nexus_file_id.write('end;\n')
        nexus_file_id.write('begin assumptions;\n')
        start_position = 1
        for locus_number, locus_length in enumerate(seq_locus_lenght_list,
                                                    start=1):
            end_position = start_position + locus_length
            nexus_file_id.write(
                f'  charset locus_{locus_number} = {start_position}-{end_position - 1};\n'
            )
            start_position = end_position
        nexus_file_id.write('end;\n')
示例#23
0
def load_table_species(conn, species_file):
    '''
    Rebuild the table "species" from the data of a species file.

    The table is dropped and created again, a row is inserted for every data
    record of the species file, the index of the table is created and the
    changes are committed. The records that are empty or start with "#" are
    not processed. The data records have the format
    "species_name";"plaza_id" and the taxonomy data of each species is
    requested with xlib.get_taxonomy_dict using the species name.

    Parameters:
        conn: database connection passed to the xsqlite functions and
            committed at the end.
        species_file: path of the species file (read with gzip if ".gz").

    Raises:
        xlib.ProgramException: with code F001 or F002 when the species file
            can not be opened and F006 when a data record does not have the
            expected format.
    '''

    # drop table "species" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "species" ...\n')
    xsqlite.drop_species(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "species"
    xlib.Message.print('verbose', 'Creating the table "species" ...\n')
    xsqlite.create_species(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of species data
    if species_file.endswith('.gz'):
        try:
            species_file_id = gzip.open(species_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', species_file) from e
    else:
        try:
            species_file_id = open(species_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', species_file) from e

    # set the pattern of the data records
    # format: "species_name";"plaza_id"
    record_pattern = re.compile(r'^"(.*)";"(.*)"$')

    # the species file is closed even if a record is wrong or the database fails
    try:

        # initialize the inserted row counter
        inserted_row_counter = 0

        # for each record of the species file
        for record_counter, record in enumerate(species_file_id, start=1):

            # process data records
            if not record.lstrip().startswith('#') and record.strip() != '':

                # initialize the row data dictionary
                row_dict = {}

                # extract data (a record that does not match the pattern is reported as F006)
                try:
                    mo = record_pattern.match(record)
                    row_dict['species_name'] = mo.group(1).strip().capitalize()
                    row_dict['plaza_species_id'] = mo.group(2).strip().lower()
                except Exception as e:
                    raise xlib.ProgramException('F006', os.path.basename(species_file), record_counter) from e

                # get the taxonomy dictionary of the species name from taxonomy server;
                # the not available value is set when the dictionary is empty and also
                # when it lacks some item (e.g. a species without an assigned rank)
                taxonomy_dict = xlib.get_taxonomy_dict('name', row_dict['species_name'])
                na = xlib.get_na()
                for rank in ('family', 'phylum', 'kingdom', 'superkingdom'):
                    row_dict[f'{rank}_name'] = taxonomy_dict.get(rank, {}).get('name', na)
                row_dict['tax_id'] = taxonomy_dict.get('tax_id', na)

                # insert data into table species
                xsqlite.insert_species_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of species file: {record_counter} - Inserted rows: {inserted_row_counter}')

        xlib.Message.print('verbose', '\n')

        # create the index on the table "species"
        xlib.Message.print('verbose', 'Creating the index on the table "species" ...\n')
        xsqlite.create_species_index(conn)
        xlib.Message.print('verbose', 'The index is created.\n')

        # save changes into TOA database
        xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
        conn.commit()
        xlib.Message.print('verbose', 'Changes are saved.\n')

    finally:

        # close species file
        species_file_id.close()
示例#24
0
def check_args(args):
    '''
    Check the input arguments.

    An error message is printed for every wrong argument. Side effects on
    "args": "hybrid_id" and "allele_transformation" are set to 'NONE' when they
    are not indicated; "variant_number_per_file", "verbose" and "trace" get
    their default value when they are not indicated;
    "variant_number_per_file" is converted to integer and
    "allele_transformation" to uppercase when they are valid; "tvi_list" is
    converted to a list (empty when it is not indicated or it is 'NONE'). The
    verbose and trace status of xlib.Message are switched on when the
    corresponding argument is 'Y' (case-insensitive).

    Raises:
        xlib.ProgramException: with code P001 when any argument is wrong.
    '''

    # initialize the control variable
    OK = True

    # check "vcf_file"
    if args.vcf_file is None:
        xlib.Message.print(
            'error',
            '*** The VCF file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.vcf_file):
        xlib.Message.print('error',
                           f'*** The file {args.vcf_file} does not exist.')
        OK = False

    # check "sample_file"
    if args.sample_file is None:
        xlib.Message.print(
            'error',
            '*** The sample file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.sample_file):
        xlib.Message.print('error',
                           f'*** The file {args.sample_file} does not exist.')
        OK = False

    # check "sp1_id"
    if args.sp1_id is None:
        xlib.Message.print(
            'error',
            '*** The identification of the first species is not indicated in the input arguments.'
        )
        OK = False

    # check "sp2_id"
    if args.sp2_id is None:
        xlib.Message.print(
            'error',
            '*** The identification of the second species is not indicated in the input arguments.'
        )
        OK = False

    # check "hybrid_id" (it is optional: 'NONE' stands for a not indicated hybrid)
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # check "output_dir"
    if args.output_dir is None:
        xlib.Message.print(
            'error',
            '*** The output directory is not indicated in the input arguments.')
        OK = False
    elif not os.path.isdir(args.output_dir):
        xlib.Message.print('error', '*** The output directory does not exist.')
        OK = False

    # check "variant_number_per_file"
    if args.variant_number_per_file is None:
        args.variant_number_per_file = xlib.Const.DEFAULT_VARIANT_NUMBER_PER_FILE
    elif not xlib.check_int(args.variant_number_per_file, minimum=1):
        xlib.Message.print(
            'error',
            'The variant number per file has to be an integer number greater than 0.'
        )
        OK = False
    else:
        args.variant_number_per_file = int(args.variant_number_per_file)

    # check "allele_transformation"
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif not xlib.check_code(args.allele_transformation,
                             xlib.get_allele_transformation_code_list(),
                             case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.'
        )
        OK = False
    else:
        args.allele_transformation = args.allele_transformation.upper()

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check "tvi_list"
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # check the identification set (only when the previous checks are right);
    # "hybrid_id" is never None here because it was set to 'NONE' above, so it
    # is always compared: a species identification equal to 'NONE' is rejected
    # when the hybrid is not indicated
    if OK:
        if args.sp1_id == args.sp2_id or \
           args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id:
            xlib.Message.print('error',
                               'The identifications must be different.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
示例#25
0
def load_table_kegg_ids(conn, kegg_id_file):
    '''
    Rebuild the table "kegg_ids" from a tab-separated file of KEGG ids.

    The table is dropped and created again, a row is inserted for every data
    record of the file (the first record is handled as a header and it is not
    loaded), the index on the table is created and the changes are committed.

    Parameters:
        conn: database connection passed to the "xsqlite" functions; its
            method "commit" is called when the load ends.
        kegg_id_file: path of the KEGG ids file; it is read with gzip when
            its name ends with ".gz".

    Raises:
        xlib.ProgramException: with code "F002" (gzip file) or "F001" (plain
            file) if the file can not be opened, and with code "F006" if a
            data record does not match the expected format.
    '''

    # set the pattern of the data records
    # format: kegg_id\tthreshold\tscore_type\tprofile_type\tF-measure\tnseq\tnseq_used\talen\tmlen\teff_nseq\tre/pos\tdefinition
    record_pattern = re.compile(r'^(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)$')

    # drop table "kegg_ids"
    xlib.Message.print('verbose', 'Droping the table "kegg_ids" ...\n')
    xsqlite.drop_kegg_ids(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "kegg_ids"
    xlib.Message.print('verbose', 'Creating the table "kegg_ids" ...\n')
    xsqlite.create_kegg_ids(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of KEGG ids
    if kegg_id_file.endswith('.gz'):
        try:
            kegg_id_file_id = gzip.open(kegg_id_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', kegg_id_file) from e
    else:
        try:
            kegg_id_file_id = open(kegg_id_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', kegg_id_file) from e

    # initialize the record counter and the inserted row counter
    record_counter = 0
    inserted_row_counter = 0

    # the context manager closes the file even when a record is rejected
    with kegg_id_file_id:

        # while there are records
        for record in kegg_id_file_id:

            # add 1 to the record counter
            record_counter += 1

            # the first record is the header: it is counted but it is not loaded
            if record_counter > 1:

                # extract data; a record that does not match the pattern is a format error
                mo = record_pattern.match(record)
                if mo is None:
                    raise xlib.ProgramException('F006', os.path.basename(kegg_id_file), record_counter)

                # initialize the row data dictionary
                row_dict = {'kegg_id': mo.group(1).strip().lower()}

                # definition format: description [EC:ec_id]
                definition = mo.group(12).strip()
                open_bracket_pos = definition.find('[')
                if open_bracket_pos > -1:
                    row_dict['desc'] = definition[:open_bracket_pos].strip()
                    # skip the four characters of "[EC:" and the closing bracket
                    row_dict['ec_id'] = definition[open_bracket_pos + 4:-1].strip()
                else:
                    row_dict['desc'] = definition
                    row_dict['ec_id'] = 'N/A'

                # change quotation marks and semicolons in "desc"
                row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(';', ',')

                # insert data into table "kegg_ids"
                xsqlite.insert_kegg_ids_row(conn, row_dict)
                inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of KEGG ids file: {record_counter}  - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # create the index on the table "kegg_ids"
    xlib.Message.print('verbose', 'Creating the index on the table "kegg_ids" ...\n')
    xsqlite.create_kegg_ids_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
示例#26
0
def _get_simhyb_allele_data(variant_data_dict, allele_index, variant_number,
                            allele_transformation, atcg_conversion_dict):
    '''
    Get the allele and its frequency written in a SimHyb record for a variant.

    Parameters:
        variant_data_dict: dictionary allele -> frequency of the variant in a
            species (it can be empty).
        allele_index: position of the allele in the sorted list of alleles.
        variant_number: number of the variant, used as key of
            "atcg_conversion_dict".
        allele_transformation: 'ADD100' and 'ATCG' change the allele; any
            other value leaves the allele as it is.
        atcg_conversion_dict: dictionary variant number -> list of the codes
            of the reference and alternative alleles.

    Returns:
        The tuple (allele, allele_frequency); it is (0, 0) when the variant
        has not an allele in the position "allele_index".
    '''

    # the alleles are taken in sorted order
    allele_list = sorted(variant_data_dict.keys())
    if allele_index >= len(allele_list):
        return (0, 0)

    allele = allele_list[allele_index]
    allele_frequency = variant_data_dict[allele]
    if allele_transformation == 'ADD100' and xlib.check_int(allele):
        allele = int(allele) + 100
    elif allele_transformation == 'ATCG':
        allele = atcg_conversion_dict[variant_number][int(allele)]

    return (allele, allele_frequency)


def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id,
                           output_dir, variant_number_per_file,
                           allele_transformation, tvi_list):
    '''
    Build SimHyb files with the allele frequencies per species of a VCF file.

    For every variant, the alleles of the genotypes (subfield GT) of the
    samples whose mother identification is 'NONE' are counted per species and
    their frequencies are calculated for the species 1 and the species 2.
    The frequencies are written in one or more CSV files in "output_dir",
    each one with "variant_number_per_file" variants at the most.

    Parameters:
        vcf_file: path of the input VCF file (read with gzip if it ends with
            ".gz").
        sample_file: path of the sample file passed to "xlib.get_sample_data".
        sp1_id: identification of the species 1.
        sp2_id: identification of the species 2.
        hybrid_id: identification of the hybrids, passed to
            "xlib.get_sample_data".
        output_dir: directory where the SimHyb files are written.
        variant_number_per_file: maximum number of variants per SimHyb file.
        allele_transformation: 'ADD100' adds 100 to the integer alleles,
            'ATCG' replaces the allele by the code of its base (A -> 1;
            T -> 2; C -> 3; G -> 4); any other value leaves the allele as it
            is.
        tvi_list: list of variant identifications ("chrom-pos") whose data
            are printed as trace messages.

    Raises:
        xlib.ProgramException: with code "F001"/"F002" if the VCF file can
            not be opened, "F003" if a SimHyb file can not be opened, "L002"
            if a sample of the VCF file is not in the sample data, "L003" if
            the VCF file has not samples, "L007" if the field FORMAT has not
            the subfield GT, "L008" if a genotype has not a separator and
            "L016" if a reference or alternative allele is not found in
            'ATCG'.
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize ATCG conversion dictionary
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg = 'ATCG'
    atcg_conversion_dict = {}

    def print_counters():
        # print the counters of processed records and variants
        xlib.Message.print(
            'verbose',
            f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}'
        )

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the context manager closes the VCF file even when a record is rejected
    with vcf_file_id:

        # get the symbol of missing data
        md_symbol = xlib.get_md_symbol()

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample species and mother identification lists per variant
                # (the sample identifications begin in the tenth column)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                        mother_id = sample_dict[sample_id]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                # (there is not a caught exception here, so the first argument is empty)
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification and check if it has to be traced
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if is_traced:
                    xlib.Message.print(
                        'trace',
                        f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                    )
                    xlib.Message.print(
                        'trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF)
                reference_bases = data_dict['ref']

                # build the alternative alleles list from field ALT
                alternative_allele_list = data_dict['alt'].split(',')

                # build ATCG conversion list: the reference first and then the alternatives
                # NOTE(review): str.find also matches substrings, so values such as
                # 'AT' or '' are accepted and get the code of their position - confirm
                # that only single-base alleles are expected here
                atcg_conversion_list = []
                for allele_bases in [reference_bases] + alternative_allele_list:
                    index = atcg.find(allele_bases.upper())
                    if index == -1:
                        raise xlib.ProgramException('', 'L016')
                    atcg_conversion_list.append(index + 1)
                atcg_conversion_dict[total_variant_counter] = atcg_conversion_list

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt_list.append(sample_data_list[gt_position])

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    # the separator is '/' or, if it is not found, '|'
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                if is_traced:
                    xlib.Message.print('trace',
                                       f'reference_bases: {reference_bases}')
                    xlib.Message.print(
                        'trace',
                        f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace',
                                       f'sample_gt_list: {sample_gt_list}')

                # get the allele counters per species
                allele_counter_dict_1 = {}
                allele_counter_dict_2 = {}
                allele_counter_dict_h = {}
                for i in range(sample_number):

                    # only when the sample is an adult
                    if mother_id_list[i] != 'NONE':
                        continue

                    # select the counter of the species of the sample
                    if species_id_list[i] == sp1_id:
                        allele_counter_dict = allele_counter_dict_1
                    elif species_id_list[i] == sp2_id:
                        allele_counter_dict = allele_counter_dict_2
                    else:
                        allele_counter_dict = allele_counter_dict_h

                    # count the alleles of both sides of the genotype except missing data
                    for allele in (sample_gt_left_list[i], sample_gt_right_list[i]):
                        if allele != md_symbol:
                            allele_counter_dict[allele] = allele_counter_dict.get(allele, 0) + 1

                if is_traced:
                    xlib.Message.print(
                        'trace', f'allele_counter_dict_1: {allele_counter_dict_1}')
                    xlib.Message.print(
                        'trace', f'allele_counter_dict_2: {allele_counter_dict_2}')
                    xlib.Message.print(
                        'trace', f'allele_counter_dict_h: {allele_counter_dict_h}')

                # calculate the maximum allele number
                maximum_allele_number = max(maximum_allele_number,
                                            len(allele_counter_dict_1),
                                            len(allele_counter_dict_2))

                # calculate the variant allele frequencies per species
                for (species_number, allele_counter_dict, allele_frequency_dict) in (
                        (1, allele_counter_dict_1, allele_frequency_dict_1),
                        (2, allele_counter_dict_2, allele_frequency_dict_2)):
                    allele_frequency_dict[total_variant_counter] = {}
                    allele_total = sum(allele_counter_dict.values())
                    for allele in allele_counter_dict:
                        allele_frequency_dict[total_variant_counter][allele] = allele_counter_dict[allele] / allele_total
                        if is_traced:
                            xlib.Message.print(
                                'trace',
                                f'allele_frequency_dict_{species_number}[{total_variant_counter}][{allele}]: {allele_frequency_dict[total_variant_counter][allele]}'
                            )

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter /
                                variant_number_per_file)

    # initialize the begin and end variant
    begin_variant = 1
    end_variant = min(variant_number_per_file, total_variant_counter)

    # get the VCF file name without the extension (and without ".gz" if it is compressed)
    vcf_base_name = os.path.basename(vcf_file[:-3] if vcf_file.endswith('.gz') else vcf_file)
    file_name = os.path.splitext(vcf_base_name)[0]

    # write the variant allele frequencies per species in the output SimHyb files
    for file_index in range(simhyb_file_num):

        xlib.Message.print(
            'trace',
            f'\n\n\n\nbegin_variant: {begin_variant} - end_variant: {end_variant}')

        # set the SimHyb file name
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file (its name always ends with ".csv", so it is never compressed)
        try:
            current_simhyb_file_id = open(current_simhyb_file,
                                          mode='w',
                                          encoding='iso-8859-1',
                                          newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # the context manager closes the SimHyb file even when the writing fails
        with current_simhyb_file_id:

            # write allele frequency records: one record per allele position
            for allele_index in range(maximum_allele_number):

                xlib.Message.print('trace', f'i: {allele_index}')

                # initialize the list of "allele;frequency" parts of the record
                record_part_list = []

                # species 1
                for j in range(begin_variant, end_variant + 1):

                    xlib.Message.print('trace', f'j: {j}')

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_1.get(j, {})

                    xlib.Message.print('trace',
                                       f'variant_data_dict: {variant_data_dict}')

                    (allele, allele_frequency) = _get_simhyb_allele_data(
                        variant_data_dict, allele_index, j,
                        allele_transformation, atcg_conversion_dict)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # species 2
                for j in range(begin_variant, end_variant + 1):

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_2.get(j, {})
                    (allele, allele_frequency) = _get_simhyb_allele_data(
                        variant_data_dict, allele_index, j,
                        allele_transformation, atcg_conversion_dict)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # write the record
                current_simhyb_file_id.write(';'.join(record_part_list) + '\n')

        # print OK message
        xlib.Message.print(
            'info',
            f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.'
        )

        # set the new begin and end variant
        begin_variant = end_variant + 1
        end_variant = min(begin_variant + variant_number_per_file - 1,
                          total_variant_counter)
示例#27
0
def filter_transcripts(assembly_software_code, transcriptome_file, score_file,
                       output_file, minlen, maxlen, minFPKM, minTPM):
    '''
    Filter transcripts according to their length, FPKM and TPM.

    The length, FPKM and TPM of every transcript are read from the score
    file (tab-separated, with a header record) and the transcripts of the
    transcriptome file (FASTA format) that meet all the conditions are
    written in the output file. A transcript without data in the score file
    is handled as if its length, FPKM and TPM were 0.

    Parameters:
        assembly_software_code: if it is "xlib.Const.AS_TRINITY_CODE", the
            head records have to begin with "TRINITY".
        transcriptome_file: path of the input FASTA file.
        score_file: path of the score file; its header has to contain the
            columns "transcript_id..." , "length", "FPKM" and "TPM".
        output_file: path of the output FASTA file.
        minlen: minimum length of the transcripts saved (inclusive).
        maxlen: maximum length of the transcripts saved (inclusive).
        minFPKM: minimum FPKM of the transcripts saved (inclusive).
        minTPM: minimum TPM of the transcripts saved (inclusive).

    Files whose name ends with ".gz" are read or written with gzip.

    Raises:
        xlib.ProgramException: with code "F001"/"F002" if an input file can
            not be opened, "F003"/"F004" if the output file can not be
            opened, "L015" if a column is not found in the header of the
            score file, "D002" if a length is not an integer number, "D003"
            if a FPKM or TPM is not a float number, "F007" if a head record
            does not correspond to Trinity, "F006" if the transcriptome file
            has not FASTA format and "F001" if the output file can not be
            written.
    '''

    # initialize the transcripts dictionary
    transcripts_dict = {}

    # open the score file
    if score_file.endswith('.gz'):
        try:
            score_file_id = gzip.open(score_file,
                                      mode='rt',
                                      encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', score_file)
    else:
        try:
            score_file_id = open(score_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', score_file)

    # the context manager closes the score file even when a record is rejected
    with score_file_id:

        # read the first record of score file and find out the transcript id, length, FPKM and TPM positions
        header_list = score_file_id.readline().split('\t')
        transcript_id_position = -1
        length_position = -1
        FPKM_position = -1
        TPM_position = -1
        for i, datum in enumerate(header_list):
            column_name = datum.strip().upper()
            if column_name.startswith('TRANSCRIPT_ID'):
                transcript_id_position = i
            if column_name == 'LENGTH':
                length_position = i
            elif column_name == 'FPKM':
                FPKM_position = i
            elif column_name == 'TPM':
                TPM_position = i
        if -1 in (transcript_id_position, length_position, FPKM_position, TPM_position):
            raise xlib.ProgramException('', 'L015')

        # while there are records in score file, save their transcript id, length, FPKM and TPM
        score_record = score_file_id.readline()
        while score_record != '':
            data_list = score_record.split('\t')
            transcript_id = data_list[transcript_id_position].upper()

            # the length has to be an integer number, although it can be written as a float
            try:
                length = float(data_list[length_position])
            except Exception as e:
                raise xlib.ProgramException(e, 'D002',
                                            data_list[length_position],
                                            'length')
            if not length.is_integer():
                raise xlib.ProgramException('', 'D002',
                                            data_list[length_position],
                                            'length')
            length = int(length)

            try:
                FPKM = float(data_list[FPKM_position])
            except Exception as e:
                raise xlib.ProgramException(e, 'D003',
                                            data_list[FPKM_position], 'FPKM')
            try:
                TPM = float(data_list[TPM_position])
            except Exception as e:
                # same error code as the FPKM conversion
                raise xlib.ProgramException(e, 'D003',
                                            data_list[TPM_position], 'TPM')

            transcripts_dict[transcript_id] = {
                'length': length,
                'FPKM': FPKM,
                'TPM': TPM
            }
            score_record = score_file_id.readline()

    # open the transcriptome file
    if transcriptome_file.endswith('.gz'):
        try:
            transcriptome_file_id = gzip.open(transcriptome_file,
                                              mode='rt',
                                              encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', transcriptome_file)
    else:
        try:
            transcriptome_file_id = open(transcriptome_file,
                                         mode='r',
                                         encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', transcriptome_file)

    # initialize the count of transcripts and saved transcripts
    transcripts_count = 0
    saved_transcripts_count = 0

    # set the pattern of the head records (>transcriptome_info)
    head_pattern = re.compile(r'^>(.*)$')

    # the context managers close the files even when an error is raised
    with transcriptome_file_id:

        # open the output file
        if output_file.endswith('.gz'):
            try:
                output_file_id = gzip.open(output_file,
                                           mode='wt',
                                           encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', output_file)
        else:
            try:
                output_file_id = open(output_file,
                                      mode='w',
                                      encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', output_file)

        with output_file_id:

            # read the first record of transcriptome file
            transcriptome_record = transcriptome_file_id.readline()

            # while there are records in transcriptome file
            while transcriptome_record != '':

                # control the FASTA format: a head record is expected here
                if not transcriptome_record.startswith('>'):
                    raise xlib.ProgramException('', 'F006',
                                                transcriptome_file, 'FASTA')

                # extract the data of the head record
                transcript_info = head_pattern.search(transcriptome_record).group(1)

                # check the origin
                if assembly_software_code == xlib.Const.AS_TRINITY_CODE and \
                   transcript_info[:7].upper() != 'TRINITY':
                    raise xlib.ProgramException('', 'F007',
                                                transcriptome_record)

                # get the transcript id
                transcript_id = transcript_info.split(' ')[0].upper()

                # read the sequence records until the next head record or the end of file
                transcript_seq_list = []
                transcriptome_record = transcriptome_file_id.readline()
                while transcriptome_record != '' and not transcriptome_record.startswith('>'):
                    transcript_seq_list.append(transcriptome_record.strip())
                    transcriptome_record = transcriptome_file_id.readline()
                transcript_seq = ''.join(transcript_seq_list)

                # add 1 to transcripts count
                transcripts_count += 1

                # write the transcript if its length is between the minimum and maximum
                # length, and FPKM and TPM are greater or equal to arguments values
                transcript_data = transcripts_dict.get(transcript_id, {})
                length = transcript_data.get('length', 0)
                FPKM = transcript_data.get('FPKM', 0)
                TPM = transcript_data.get('TPM', 0)
                if minlen <= length <= maxlen and FPKM >= minFPKM and TPM >= minTPM:
                    try:
                        output_file_id.write(f'>{transcript_info}\n')
                        output_file_id.write(f'{transcript_seq}\n')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'F001', output_file)
                    # add 1 to saved transcripts count
                    saved_transcripts_count += 1

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rTranscripts processed ... {transcripts_count:9d} - Transcripts saved ... {saved_transcripts_count:9d}'
                )

            xlib.Message.print('verbose', '\n')

    # print OK message
    print(
        f'\nThe file {os.path.basename(output_file)} containing the transcripts selected is created.'
    )
示例#28
0
def collapse_indels(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id,
                    hybrid_id, output_vcf_file, stats_file, tvi_list):
    '''
    Collapses the variant records corresponding to an indel in a VCF file.

    Metadata records ("##") and the column description record ("#CHROM") are
    copied to the output VCF file. Variant records are grouped in runs: a run
    starts with a variant record and is extended with the following records
    while they belong to the same sequence and are placed in consecutive
    positions. The record that has no adult sample (mother "NONE") with an
    imputed genotype is the last record of its run. Every run is written as
    one output record whose position is the position of the first record and
    whose REF field is the concatenation of the REF fields of the run.

    The remaining fields of the output record are taken from the record
    without imputed adults that ends the run; if there is none, from the first
    record of the run whose ALT field is the missing data symbol; otherwise,
    from the first record of the run.

    Parameters:
        input_vcf_file: path of the input VCF file (gzip if it ends in ".gz").
        sample_file, sp1_id, sp2_id, hybrid_id: passed to
            "xlib.get_sample_data" to get the sample data.
        imputed_md_id: allele identification that marks an imputed genotype.
        output_vcf_file: path of the output VCF file (gzip if it ends in ".gz").
        stats_file: path of the statistics file (semicolon-separated values:
            seq_id, position, records, length and imputed); the column
            "imputed" corresponds to the last record read of the run.
        tvi_list: variant identifications ("chrom-pos") with additional trace
            messages.

    Raises:
        xlib.ProgramException: F001/F002 (input file) or F003/F004 (output
            files) when a file can not be opened; L002 when a sample of the
            VCF file is not found in the sample data; L003 when the VCF file
            has no samples; L007 when the field FORMAT has no subfield GT;
            L008 when a genotype has neither "/" nor "|".
    '''

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # open the input VCF file, the output VCF file and the statistics file;
    # the "with" statements close every opened file even if a later open or
    # the processing of a record fails
    with _open_collapse_indels_file(input_vcf_file,
                                    False) as input_vcf_file_id:
        with _open_collapse_indels_file(output_vcf_file,
                                        True) as output_vcf_file_id:
            with _open_collapse_indels_file(stats_file,
                                            True) as stats_file_id:
                _collapse_indel_records(input_vcf_file_id, output_vcf_file_id,
                                        stats_file_id, sample_dict,
                                        imputed_md_id, tvi_list)

    # print OK message
    xlib.Message.print(
        'info', f'The file {os.path.basename(output_vcf_file)} is created.')


def _open_collapse_indels_file(file_path, for_writing):
    '''
    Open a text file encoded in ISO-8859-1, using gzip when the path ends in ".gz".

    Raises "xlib.ProgramException" with code F001 (plain) or F002 (gzip) when
    a file can not be opened for reading, and F003 (plain) or F004 (gzip) when
    it can not be opened for writing.
    '''

    is_gzip = file_path.endswith('.gz')

    # set the mode, the additional options and the error code of the opening
    if for_writing:
        error_code = 'F004' if is_gzip else 'F003'
        mode = 'wt' if is_gzip else 'w'
        options = {'newline': '\n'}
    else:
        error_code = 'F002' if is_gzip else 'F001'
        mode = 'rt' if is_gzip else 'r'
        options = {}

    # open the file
    try:
        if is_gzip:
            return gzip.open(file_path,
                             mode=mode,
                             encoding='iso-8859-1',
                             **options)
        return open(file_path, mode=mode, encoding='iso-8859-1', **options)
    except Exception as e:
        raise xlib.ProgramException(e, error_code, file_path)


def _collapse_indel_records(input_vcf_file_id, output_vcf_file_id,
                            stats_file_id, sample_dict, imputed_md_id,
                            tvi_list):
    '''
    Read the records of the opened input VCF file and write the collapsed
    variant records in the output VCF file and their statistics in the
    statistics file.
    '''

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # initialize the sample number (it is set when the column description
    # record is read)
    sample_number = 0

    # initialize the species and mother identification lists per sample
    species_id_list = []
    mother_id_list = []

    # write the statistics header
    stats_file_id.write('"seq_id";"position";"records";"length";"imputed"\n')

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    collapsed_variant_counter = 0
    created_indel_counter = 0

    def print_counters():
        # the counters are read when the function is called, not when it is defined
        xlib.Message.print(
            'verbose',
            f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}'
        )

    # read the first record of input VCF file
    (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # write the metadata record
            output_vcf_file_id.write(record)

            # print the counters
            print_counters()

            # read the next record of the input VCF file
            (record, _,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # build the species and mother identification lists per sample
            # (the sample names start in the tenth column of the record)
            for sample_name in data_dict['record_data_list'][9:]:
                try:
                    sample_data = sample_dict[sample_name]
                    species_id = sample_data['species_id']
                    mother_id = sample_data['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', sample_name)
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            if not species_id_list:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # write the column description record
            output_vcf_file_id.write(record)

            # print the counters
            print_counters()

            # read the next record of the input VCF file
            (record, _,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith(('##', '#CHROM')):

            xlib.Message.print('trace', 'Iniciando...')

            # set the sequence identification and position control variables
            w_seq_id = data_dict['chrom']
            w_position = int(data_dict['pos'])

            # initialize the record counter of the "actual" variant
            actual_variant_record_counter = 0

            # initialize the reference bases (field REF)
            reference_bases = ''

            # initialize the found best sample list control variable
            found_best_sample_list = False

            # initialize the collapse control variable
            collapse = True

            # process variant records of same "actual" variant; the first
            # record always satisfies the condition, so the variables set
            # inside the loop are always defined when it ends
            while (record != ''
                   and not record.startswith(('##', '#CHROM'))
                   and data_dict['chrom'] == w_seq_id
                   and int(data_dict['pos']) ==
                   w_position + actual_variant_record_counter and collapse):

                xlib.Message.print('trace', 'Inside the loop')
                xlib.Message.print(
                    'trace',
                    f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
                )

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced_variant = variant_id in tvi_list
                if is_traced_variant:
                    xlib.Message.print(
                        'trace',
                        f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                    )

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # add 1 to the record counter of the "actual" variant
                actual_variant_record_counter += 1

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt_list.append(sample_data_list[gt_position])

                # build the lists of the left and right side of sample
                # genotypes of a variant (the separator is "/" or "|")
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])
                if is_traced_variant:
                    xlib.Message.print('trace',
                                       f'sample_gt_list: {sample_gt_list}')

                # count the adult samples (mother "NONE") with imputed data
                imputed_adult_count = 0
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE' and (
                            sample_gt_left_list[i] == imputed_md_id
                            or sample_gt_right_list[i] == imputed_md_id):
                        imputed_adult_count += 1

                xlib.Message.print(
                    'trace',
                    f'variant_id: {variant_id} - imputed_adult_count: {imputed_adult_count}'
                )

                # concat the current reference bases to the new reference bases
                reference_bases = f'{reference_bases}{data_dict["ref"]}'

                # decide if the fields of the current record replace the
                # fields kept for the output record
                take_record_fields = False
                if imputed_adult_count == 0:
                    # a record without imputed adults ends the run
                    take_record_fields = True
                    collapse = False
                elif actual_variant_record_counter == 1:
                    # the first record of the run is the initial candidate
                    take_record_fields = True
                    if data_dict['alt'] == md_symbol:
                        found_best_sample_list = True
                elif not found_best_sample_list and data_dict[
                        'alt'] == md_symbol:
                    # the first record with missing data in ALT is preferred
                    take_record_fields = True
                    found_best_sample_list = True

                if take_record_fields:
                    variant_record_id = data_dict['id']
                    alternative_alleles = data_dict['alt']
                    qual = data_dict['qual']
                    filter_field = data_dict['filter']
                    info = data_dict['info']
                    format_field = data_dict['format']
                    best_sample_list = data_dict['sample_list']

                # read the next record of the input VCF file
                xlib.Message.print('trace', 'Reading ...')
                (record, _,
                 data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                 sample_number)
                if record != '':
                    xlib.Message.print(
                        'trace',
                        f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
                    )

            # write the variant record
            xlib.Message.print('trace', 'Writing VCF ...')
            xlib.Message.print(
                'trace',
                f'w_seq_id: {w_seq_id} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
            )
            sample_list_text = '\t'.join(best_sample_list)
            output_vcf_file_id.write(
                f'{w_seq_id}\t{w_position}\t{variant_record_id}\t{reference_bases}\t{alternative_alleles}\t{qual}\t{filter_field}\t{info}\t{format_field}\t{sample_list_text}\n'
            )

            # write the collapsing statistics record
            xlib.Message.print('trace', 'Writing stats...')
            is_imputed = 'IMPUTED' if imputed_adult_count > 0 else '-'
            stats_file_id.write(
                f'{w_seq_id};{w_position};{actual_variant_record_counter};{len(reference_bases)};{is_imputed}\n'
            )

            # update the collapsing counters: when the run has more than one
            # record, all its records are counted as collapsed variants and
            # the output record is counted as a created indel
            if actual_variant_record_counter > 1:
                collapsed_variant_counter += actual_variant_record_counter
                created_indel_counter += 1

            # print the counters
            print_counters()

    xlib.Message.print('verbose', '\n')
示例#29
0
def check_args(args):
    '''
    Verify the input arguments.

    An error message is printed for every wrong argument. The arguments
    "verbose" and "trace" get their default values when they are not
    indicated, and the corresponding message status is activated when their
    value is "Y" (in upper or lower case). If any argument is wrong, a
    "xlib.ProgramException" with code P001 is raised after all checks.
    '''

    # control variable: it is set to False when an argument is wrong
    is_valid = True

    # check loci_file_path: it has to be indicated, exist and end in ".loci"
    loci_path = args.loci_file_path
    if loci_path is not None:
        if not os.path.isfile(loci_path):
            xlib.Message.print(
                'error', f'*** The file {loci_path} does not exist.')
            is_valid = False
        if not loci_path.endswith('.loci'):
            xlib.Message.print(
                'error', f'*** The file {loci_path} does not end in ".loci".')
            is_valid = False
    else:
        xlib.Message.print(
            'error',
            '*** A loci file path is not indicated in the input arguments.')
        is_valid = False

    # check stats_file_path: it has to be indicated and end in ".csv"
    stats_path = args.stats_file_path
    if stats_path is None:
        xlib.Message.print(
            'error',
            '*** A statistics path is not indicated in the input arguments.')
        is_valid = False
    elif not stats_path.endswith('.csv'):
        xlib.Message.print(
            'error', f'*** The file {stats_path} does not end in ".csv".')
        is_valid = False

    # check "verbose"
    if args.verbose is not None:
        verbose_is_valid = xlib.check_code(args.verbose,
                                           xlib.get_verbose_code_list(),
                                           case_sensitive=False)
        if not verbose_is_valid:
            xlib.Message.print(
                'error',
                f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
            is_valid = False
    else:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is not None:
        trace_is_valid = xlib.check_code(args.trace,
                                         xlib.get_trace_code_list(),
                                         case_sensitive=False)
        if not trace_is_valid:
            xlib.Message.print(
                'error',
                f'*** trace has to be {xlib.get_trace_code_list_text()}.')
            is_valid = False
    else:
        args.trace = xlib.Const.DEFAULT_TRACE
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are no errors, there is nothing else to do
    if is_valid:
        return

    # if there are errors, exit with exception
    raise xlib.ProgramException('', 'P001')
示例#30
0
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list):
    '''
    Load data of a VCF file into the NGShelper database.

    The tables "vcf_samples", "vcf_variants", "vcf_alleles" and
    "vcf_samples_alleles" are dropped and created again. The table
    "vcf_samples" is filled from the sample data and the other three tables
    from the variant records of the VCF file. The indexes of the tables are
    created and the changes are committed at the end.

    Parameters:
        conn: database connection used by the "xsqlite" functions; it is
            committed at the end.
        vcf_file: path of the input VCF file (read with gzip if it ends in ".gz").
        sample_file, sp1_id, sp2_id, hybrid_id: passed to
            "xlib.get_sample_data" to get the sample data.
        imputed_md_id: allele identification that marks imputed missing data
            in the genotypes.
        new_md_id: value saved as "structure_allele_id" of the missing data
            symbol (see "allele_transformation").
        allele_transformation: if it is "ADD100", 100 is added to the values
            accepted by "xlib.check_int" to get the "structure_allele_id".
        tvi_list: variant identifications ("chrom-pos") with additional trace
            messages.

    Raises:
        xlib.ProgramException: F001 (plain) or F002 (gzip) when the VCF file
            can not be opened; L002 when a sample of the VCF file is not
            found in the sample data; L003 when the VCF file has no samples;
            L007 when the field FORMAT has no subfield GT; L008 when a
            genotype has neither "/" nor "|".
    '''

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # drop table "vcf_samples" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples" ...\n')
    xsqlite.drop_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples" ...\n')
    xsqlite.create_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # insert samples data into table "vcf_samples"; the key "type" is added
    # in place to every dictionary of "sample_dict" with the provisional value "N/A"
    xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n')
    for key, value in sample_dict.items():
        value['type'] = 'N/A'
        xsqlite.insert_vcf_samples_row(conn, value)
    xlib.Message.print('verbose', 'Data are inserted.\n')
     
    # create the index on the table "vcf_samples" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n')
    xsqlite.create_vcf_samples_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # get the sample type dictionary
    sample_type_dict = xsqlite.get_sample_type_dict(conn)

    # update the type of each sample (it replaces the provisional value "N/A")
    for key in sample_type_dict.keys():
        xsqlite.update_vcf_samples_row(conn, sample_type_dict[key]['sample_id'], sample_type_dict[key]['type'])

    # drop table "vcf_variants" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_variants" ...\n')
    xsqlite.drop_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_alleles" ...\n')
    xsqlite.drop_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_samples_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples_alleles" ...\n')
    xsqlite.drop_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # initialize the row data dictionaries corresponding to the tables "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    # (the same dictionaries are reused for every inserted row)
    vcf_variants_row_dict = {}
    vcf_alleles_row_dict = {}
    vcf_samples_alleles_row_dict = {}

    # build the list of imputed and missing data alleles
    M_I_list = [imputed_md_id, xlib.get_md_symbol()]

    # initialize the sample number (it is set when the column description record is read)
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    vcf_variants_inserted_row_counter = 0
    vcf_alleles_inserted_row_counter = 0
    vcf_samples_alleles_inserted_row_counter = 0

    # initialize the sample, species and mother identification lists per sample
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    # NOTE(review): the file is closed only in the last statement of this block, so it remains open if an exception is raised before
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # read the first record of input VCF file
    (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in input VCF file (the loop ends when an empty record is got)
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... { total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample, species and mother identification lists per sample
            # (the sample names start in the tenth column of the record)
            for i in range(9, len(record_data_list)):
                try:
                    sample_id = record_data_list[i]
                    species_id = sample_dict[record_data_list[i]]['species_id']
                    mother_id = sample_dict[record_data_list[i]]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', record_data_list[i])
                sample_id_list.append(sample_id)
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            if species_id_list == []:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            if variant_id in tvi_list: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

            # get the reference bases (field REF) and alternative alleles (field ALT)
            reference_bases = data_dict['ref']
            alternative_alleles = data_dict['alt']

            # build the alternative alleles list from field ALT
            alternative_allele_list = data_dict['alt'].split(',')

            # build the alleles list from reference bases and alternative alleles list
            # (only the reference bases when the field ALT is the missing data symbol)
            if alternative_alleles == xlib.get_md_symbol():
                alleles_list = [reference_bases]
            else:
                alleles_list = [reference_bases] + alternative_allele_list

            # check if the variant is an indel (both SAMtools/BCFtools and Freebayes) or SNP or multiallelic or N/A:
            # - N/A: the field ALT is the missing data symbol
            # - INDEL: the reference bases or any alternative allele have more than one base
            # - MULTIALLELIC: there is more than one alternative allele and all the alleles have one base
            # - SNP: otherwise
            variant_type = ''
            if alternative_alleles == xlib.get_md_symbol():
                variant_type = 'N/A'
            else:
                is_indel = False
                if len(reference_bases) > 1:
                    is_indel = True
                else:
                    for alternative_allele in alternative_allele_list:
                        if len(alternative_allele) > 1:
                            is_indel = True
                            break
                if is_indel:
                    variant_type = 'INDEL'
                elif len(alternative_allele_list) > 1:
                    variant_type = 'MULTIALLELIC'
                else:
                    variant_type = 'SNP'

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = []
            for i in range(sample_number):
                sample_data_list = data_dict['sample_list'][i].split(':')
                sample_gt_list.append(sample_data_list[gt_position])

            # build the lists of the left and right side of sample genotypes of a variant
            # (the separator is "/" or, if it is not found, "|")
            sample_gt_left_list = []
            sample_gt_right_list = []
            for i in range(sample_number):
                sep = '/'
                sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    sep = '|'
                    sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                sample_gt_left_list.append(sample_gt_list[i][:sep_pos])
                sample_gt_right_list.append(sample_gt_list[i][sep_pos+1:])

            if variant_id in tvi_list: xlib.Message.print('trace', f'reference_bases: {reference_bases}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

            # set data and insert row into the table "vcf_variants"
            vcf_variants_row_dict['variant_id'] = variant_id
            vcf_variants_row_dict['seq_id'] = data_dict['chrom']
            vcf_variants_row_dict['position'] = data_dict['pos']
            vcf_variants_row_dict['reference_bases'] = reference_bases
            vcf_variants_row_dict['alternative_alleles'] = alternative_alleles
            vcf_variants_row_dict['variant_type'] = variant_type
            xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict)
            vcf_variants_inserted_row_counter += 1

            # set data and insert rows into the table "vcf_alleles"
            vcf_alleles_row_dict['variant_id'] = variant_id
            # reference bases and alternative alleles (the allele identification is the index in the alleles list: 0 is the reference)
            # NOTE(review): "structure_allele_id" is a string when 100 is added and the integer "j" otherwise - confirm that both types are expected
            for j in range(len(alleles_list)):
                vcf_alleles_row_dict['allele_id'] = str(j)
                vcf_alleles_row_dict['bases'] = alleles_list[j]
                if xlib.check_int(j) and allele_transformation == 'ADD100':
                    structure_allele_id = str(int(j) + 100)
                else:
                    structure_allele_id = j
                vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1
            # missing data (a row is inserted for every variant)
            vcf_alleles_row_dict['allele_id'] = xlib.get_md_symbol()
            vcf_alleles_row_dict['bases'] = 'N/D'
            if xlib.check_int(new_md_id) and allele_transformation == 'ADD100':
                structure_allele_id = str(int(new_md_id) + 100)
            else:
                structure_allele_id = new_md_id
            vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
            xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
            vcf_alleles_inserted_row_counter += 1
            # imputed missing data (a row is inserted for every variant)
            vcf_alleles_row_dict['allele_id'] = imputed_md_id
            vcf_alleles_row_dict['bases'] = 'N/D'
            if xlib.check_int(imputed_md_id) and allele_transformation == 'ADD100':
                structure_allele_id = str(int(imputed_md_id) + 100)
            else:
                structure_allele_id = imputed_md_id
            vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
            xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
            vcf_alleles_inserted_row_counter += 1

            # set data and insert rows into the table "vcf_samples_alleles"
            vcf_samples_alleles_row_dict['variant_id'] = variant_id
            for i in range(sample_number):
                vcf_samples_alleles_row_dict['sample_id'] = sample_id_list[i]

                # initialize genotype distribution dictionary
                # (its keys are the bases of every allele, the imputed missing data identification and the missing data symbol)
                genotype_distribution_dict = {}
                for j in range(len(alleles_list)):
                    genotype_distribution_dict[alleles_list[j]] = 0
                for j in range(len(M_I_list)):
                    genotype_distribution_dict[M_I_list[j]] = 0

                # calculate genotype distribution dictionary
                # (a genotype side that is not imputed or missing data is converted to integer and used as index in the alleles list)
                if sample_gt_left_list[i] in M_I_list:
                    genotype_distribution_dict[sample_gt_left_list[i]] += 1
                else:
                    genotype_distribution_dict[alleles_list[int(sample_gt_left_list[i])]] += 1
                if sample_gt_right_list[i] in M_I_list:
                    genotype_distribution_dict[sample_gt_right_list[i]] += 1
                else:
                    genotype_distribution_dict[alleles_list[int(sample_gt_right_list[i])]] += 1

                # calculate frequency and insert rows for reference bases and alternative alleles
                # (the count is divided by 2 because it comes from the left and right sides of the genotype)
                for j in range(len(alleles_list)):
                    if genotype_distribution_dict[alleles_list[j]] > 0:
                        # the index of the allele (not its bases) is saved as allele identification
                        vcf_samples_alleles_row_dict['allele_id'] = j
                        vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[alleles_list[j]] / 2
                        xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                        vcf_samples_alleles_inserted_row_counter += 1

                # calculate frequency and insert rows for imputed missing data
                if genotype_distribution_dict[imputed_md_id] > 0:
                    vcf_samples_alleles_row_dict['allele_id'] = imputed_md_id
                    vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[imputed_md_id] / 2
                    xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                    vcf_samples_alleles_inserted_row_counter += 1

                # calculate frequency and insert rows for missing data
                if genotype_distribution_dict[xlib.get_md_symbol()] > 0:
                    vcf_samples_alleles_row_dict['allele_id'] = xlib.get_md_symbol()
                    vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[xlib.get_md_symbol()] / 2
                    xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                    vcf_samples_alleles_inserted_row_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... {vcf_samples_alleles_inserted_row_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')
     
    # create the index "vcf_variants_index" on the table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')
     
    # create the index "vcf_alleles_index" on the table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')
     
    # create the index "vcf_samples_alleles_index" on the table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into NGShelper database
    xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close the VCF file
    vcf_file_id.close()