Пример #1
0
def collapse_indels(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id,
                    hybrid_id, output_vcf_file, stats_file, tvi_list):
    '''
    Collapse runs of consecutive variant records of a VCF file into one record.

    Metadata records ("##...") and the column description record ("#CHROM...")
    are copied unchanged to the output VCF file. Variant records are grouped
    into runs. A run starts at a record and takes the following records while
    the next record is on the same sequence, its position equals the run start
    position plus the number of records already taken, and every record taken
    so far has at least one adult sample (mother identification "NONE") whose
    genotype holds the imputed missing data allele. The first record without
    such an adult is still taken, but it closes the run.

    One record is written per run: CHROM and POS come from the first record,
    REF is the concatenation of the REF fields of all the records of the run,
    and the remaining fields come from a single "best" record of the run:
      - the record without imputed adults that closed the run, if any;
      - otherwise the first record of the run whose ALT is the missing data
        symbol, if any;
      - otherwise the first record of the run.

    One row per run is also written to the statistics file
    (seq_id;position;records;length;imputed).

    Parameters:
        input_vcf_file: path of the input VCF file (gzip is used when the
            path ends with ".gz").
        sample_file, sp1_id, sp2_id, hybrid_id: passed as they are to
            xlib.get_sample_data() to get the sample dictionary.
        imputed_md_id: allele identification compared with both sides of
            every genotype to detect imputed missing data.
        output_vcf_file: path of the collapsed VCF file (gzip when ".gz").
        stats_file: path of the statistics file (gzip when ".gz").
        tvi_list: variant identifications ("chrom-pos") that get additional
            trace messages.

    Raises:
        xlib.ProgramException: F001/F002 (input) or F003/F004 (outputs) when
            a file cannot be opened; L002 when a sample of the VCF file is not
            found in the sample data; L003 when the VCF file has no samples;
            L007 when FORMAT has no GT subfield; L008 when a genotype has
            neither "/" nor "|".
    '''

    # function-scope import so that this block does not depend on the file header
    import contextlib

    def open_text_file(path, mode, newline, gz_code, plain_code):
        # open a plain or gzip-compressed (".gz") text file; any opening
        # failure is reported with the error code of the kind of file
        is_gz = path.endswith('.gz')
        try:
            if is_gz:
                return gzip.open(path,
                                 mode=f'{mode}t',
                                 encoding='iso-8859-1',
                                 newline=newline)
            return open(path,
                        mode=mode,
                        encoding='iso-8859-1',
                        newline=newline)
        except Exception as e:
            raise xlib.ProgramException(e, gz_code if is_gz else plain_code,
                                        path)

    def print_counters():
        # the counters are read when the function is called, not when it is defined
        xlib.Message.print(
            'verbose',
            f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}'
        )

    # initialize the sample number
    sample_number = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the species and mother identification lists (one item per sample)
    species_id_list = []
    mother_id_list = []

    # get the missing data symbol once (it is compared with ALT in every run)
    md_symbol = xlib.get_md_symbol()

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    collapsed_variant_counter = 0
    created_indel_counter = 0

    # the exit stack closes every opened file even when an error is raised
    with contextlib.ExitStack() as stack:

        # open the input VCF file
        input_vcf_file_id = stack.enter_context(
            open_text_file(input_vcf_file, 'r', None, 'F002', 'F001'))

        # open the output VCF file
        output_vcf_file_id = stack.enter_context(
            open_text_file(output_vcf_file, 'w', '\n', 'F004', 'F003'))

        # open the statistics file
        stats_file_id = stack.enter_context(
            open_text_file(stats_file, 'w', '\n', 'F004', 'F003'))

        # write the statistics header
        stats_file_id.write(
            '"seq_id";"position";"records";"length";"imputed"\n')

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                    sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read record counter
                input_record_counter += 1

                # write the metadata record
                output_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(
                    input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read record counter
                input_record_counter += 1

                # build the species and mother identification lists; the
                # sample columns start at index 9 of the record data list
                for sample_name in data_dict['record_data_list'][9:]:
                    try:
                        sample_data = sample_dict[sample_name]
                        # "sample_id" is not used later, but its lookup is
                        # kept so that a missing key is still reported
                        sample_data['sample_id']
                        species_id = sample_data['species_id']
                        mother_id = sample_data['mother_id']
                    except Exception as e:
                        # any lookup failure is reported as an unknown sample
                        raise xlib.ProgramException(e, 'L002', sample_name)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # write the column description record
                output_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(
                    input_vcf_file_id, sample_number)

            # process variant records: every iteration handles one run
            while record != '' and not record.startswith(('##', '#CHROM')):

                xlib.Message.print('trace', 'Iniciando...')

                # set the sequence identification and position of the run start
                w_seq_id = data_dict['chrom']
                w_position = int(data_dict['pos'])

                # initialize the record counter of the run
                actual_variant_record_counter = 0

                # initialize the reference bases (field REF) of the run
                reference_bases = ''

                # True once a record with ALT equal to the missing data symbol
                # has been chosen as the "best" record
                found_best_sample_list = False

                # False once a record without imputed adults closes the run
                collapse = True

                # process the variant records of the same run
                while (record != ''
                       and not record.startswith(('##', '#CHROM'))
                       and data_dict['chrom'] == w_seq_id
                       and int(data_dict['pos']) == w_position + actual_variant_record_counter
                       and collapse):

                    xlib.Message.print('trace', 'Inside the loop')
                    xlib.Message.print(
                        'trace',
                        f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
                    )

                    # set the variant identification
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    is_traced = variant_id in tvi_list
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                        )

                    # add 1 to the record, variant and run counters
                    input_record_counter += 1
                    total_variant_counter += 1
                    actual_variant_record_counter += 1

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except ValueError as e:
                        raise xlib.ProgramException(e, 'L007', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # build the list of sample genotypes of the variant
                    sample_gt_list = [
                        data_dict['sample_list'][i].split(':')[gt_position]
                        for i in range(sample_number)
                    ]

                    # build the lists of the left and right side of the
                    # genotypes; "/" is tried before "|"
                    sample_gt_left_list = []
                    sample_gt_right_list = []
                    for sample_gt in sample_gt_list:
                        (left, sep, right) = sample_gt.partition('/')
                        if not sep:
                            (left, sep, right) = sample_gt.partition('|')
                        if not sep:
                            raise xlib.ProgramException(
                                '', 'L008', 'GT', data_dict['chrom'],
                                data_dict['pos'])
                        sample_gt_left_list.append(left)
                        sample_gt_right_list.append(right)
                    if is_traced:
                        xlib.Message.print(
                            'trace', f'sample_gt_list: {sample_gt_list}')

                    # count the adults (samples whose mother is "NONE") with
                    # imputed missing data in any side of their genotype
                    imputed_adult_count = sum(
                        1 for (mother_id, left, right) in zip(
                            mother_id_list, sample_gt_left_list,
                            sample_gt_right_list)
                        if mother_id == 'NONE' and
                        (left == imputed_md_id or right == imputed_md_id))

                    xlib.Message.print(
                        'trace',
                        f'variant_id: {variant_id} - imputed_adult_count: {imputed_adult_count}'
                    )

                    # concat the current reference bases to the run reference bases
                    reference_bases = f'{reference_bases}{data_dict["ref"]}'

                    # decide if the current record becomes the "best" record
                    if imputed_adult_count == 0:
                        # a record without imputed adults always wins and
                        # closes the run
                        is_best_record = True
                        collapse = False
                    elif actual_variant_record_counter == 1:
                        # the first record is the default "best" record
                        is_best_record = True
                        if data_dict['alt'] == md_symbol:
                            found_best_sample_list = True
                    elif not found_best_sample_list and data_dict['alt'] == md_symbol:
                        # first record of the run whose ALT is the missing data symbol
                        is_best_record = True
                        found_best_sample_list = True
                    else:
                        is_best_record = False

                    # keep the fields of the "best" record (the names avoid
                    # shadowing the builtins id, filter and format)
                    if is_best_record:
                        best_id = data_dict['id']
                        best_alternative_alleles = data_dict['alt']
                        best_qual = data_dict['qual']
                        best_filter = data_dict['filter']
                        best_info = data_dict['info']
                        best_format = data_dict['format']
                        best_sample_list = data_dict['sample_list']

                    # read the next record of the input VCF file
                    xlib.Message.print('trace', 'Reading ...')
                    (record, _, data_dict) = xlib.read_vcf_file(
                        input_vcf_file_id, sample_number)
                    if record != '':
                        xlib.Message.print(
                            'trace',
                            f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
                        )

                # update the collapsing counters: a run with more than one
                # record collapses all its records into one created indel
                if actual_variant_record_counter > 1:
                    collapsed_variant_counter += actual_variant_record_counter
                    created_indel_counter += 1

                # write the variant record
                xlib.Message.print('trace', 'Writing VCF ...')
                xlib.Message.print(
                    'trace',
                    f'w_seq_id: {w_seq_id} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}'
                )
                sample_list_text = '\t'.join(best_sample_list)
                output_vcf_file_id.write(
                    f'{w_seq_id}\t{w_position}\t{best_id}\t{reference_bases}\t{best_alternative_alleles}\t{best_qual}\t{best_filter}\t{best_info}\t{best_format}\t{sample_list_text}\n'
                )

                # write the collapsing statistics record
                # NOTE(review): the "imputed" flag reflects only the last
                # record of the run -- confirm that this is the intended meaning
                xlib.Message.print('trace', 'Writing stats...')
                is_imputed = 'IMPUTED' if imputed_adult_count > 0 else '-'
                stats_file_id.write(
                    f'{w_seq_id};{w_position};{actual_variant_record_counter};{len(reference_bases)};{is_imputed}\n'
                )

                # print the counters
                print_counters()

        xlib.Message.print('verbose', '\n')

    # print OK message (the files are already closed here)
    xlib.Message.print(
        'info', f'The file {os.path.basename(output_vcf_file)} is created.')
Пример #2
0
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list):
    '''
    Load the data of a VCF file into the tables "vcf_samples", "vcf_variants",
    "vcf_alleles" and "vcf_samples_alleles" of the database.

    The four tables are dropped and created again. For every variant record
    of the VCF file the function inserts:
      - one row into "vcf_variants" with the variant type ("N/A" when ALT is
        the missing data symbol, "INDEL" when REF or any alternative allele
        has more than one base, "MULTIALLELIC" when there are several
        alternative alleles, "SNP" otherwise);
      - one row into "vcf_alleles" per allele (REF and every alternative
        allele), plus one row for the missing data symbol and one row for the
        imputed missing data identification;
      - one row into "vcf_samples_alleles" per sample and allele found in the
        genotype of the sample, with the allele count divided by 2.
    The indexes are created and the changes are committed at the end.

    Parameters:
        conn: database connection passed to the xsqlite functions; commit()
            is called on it.
        vcf_file: path of the VCF file (gzip is used when it ends with ".gz").
        sample_file, sp1_id, sp2_id, hybrid_id: passed as they are to
            xlib.get_sample_data() to get the sample dictionary.
        imputed_md_id: allele identification of imputed missing data.
        new_md_id: value used to build the "structure_allele_id" of the
            missing data allele row.
        allele_transformation: when it is "ADD100", 100 is added to the
            integer allele identifications to build "structure_allele_id".
        tvi_list: variant identifications ("chrom-pos") that get additional
            trace messages.

    Raises:
        xlib.ProgramException: F001/F002 when the VCF file cannot be opened;
            L002 when a sample of the VCF file is not found in the sample
            data; L003 when the VCF file has no samples; L007 when FORMAT has
            no GT subfield; L008 when a genotype has neither "/" nor "|".
    '''

    def get_structure_allele_id(allele_id):
        # add 100 to integer allele identifications when the transformation
        # is "ADD100"; otherwise the identification is returned unchanged
        if xlib.check_int(allele_id) and allele_transformation == 'ADD100':
            return str(int(allele_id) + 100)
        return allele_id

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # drop table "vcf_samples" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples" ...\n')
    xsqlite.drop_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples" ...\n')
    xsqlite.create_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # insert samples data into table "vcf_samples"; the type is unknown at
    # this point and it is updated below
    xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n')
    for sample_data in sample_dict.values():
        sample_data['type'] = 'N/A'
        xsqlite.insert_vcf_samples_row(conn, sample_data)
    xlib.Message.print('verbose', 'Data are inserted.\n')

    # create the index on the table "vcf_samples" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n')
    xsqlite.create_vcf_samples_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # get the sample type dictionary
    sample_type_dict = xsqlite.get_sample_type_dict(conn)

    # update the type of each sample
    for sample_type_data in sample_type_dict.values():
        xsqlite.update_vcf_samples_row(conn, sample_type_data['sample_id'], sample_type_data['type'])

    # drop (if it exists) and create the tables "vcf_variants", "vcf_alleles"
    # and "vcf_samples_alleles"
    for (table_name, drop_table, create_table) in (
            ('vcf_variants', xsqlite.drop_vcf_variants, xsqlite.create_vcf_variants),
            ('vcf_alleles', xsqlite.drop_vcf_alleles, xsqlite.create_vcf_alleles),
            ('vcf_samples_alleles', xsqlite.drop_vcf_samples_alleles, xsqlite.create_vcf_samples_alleles)):
        xlib.Message.print('verbose', f'Droping the table "{table_name}" ...\n')
        drop_table(conn)
        xlib.Message.print('verbose', 'The table is droped.\n')
        xlib.Message.print('verbose', f'Creating the table "{table_name}" ...\n')
        create_table(conn)
        xlib.Message.print('verbose', 'The table is created.\n')

    # initialize the row data dictionaries corresponding to the tables
    # "vcf_variants", "vcf_alleles" and "vcf_samples_alleles" (they are reused
    # for every inserted row)
    vcf_variants_row_dict = {}
    vcf_alleles_row_dict = {}
    vcf_samples_alleles_row_dict = {}

    # build the list of imputed and missing data alleles
    md_symbol = xlib.get_md_symbol()
    M_I_list = [imputed_md_id, md_symbol]

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    vcf_variants_inserted_row_counter = 0
    vcf_alleles_inserted_row_counter = 0
    vcf_samples_alleles_inserted_row_counter = 0

    # initialize the sample, species and mother identification lists (one item per sample)
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the "with" statement closes the VCF file even when an error is raised
    with vcf_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read record counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read record counter
                input_record_counter += 1

                # build the sample, species and mother identification lists;
                # the sample columns start at index 9 of the record data list
                for sample_name in data_dict['record_data_list'][9:]:
                    try:
                        species_id = sample_dict[sample_name]['species_id']
                        mother_id = sample_dict[sample_name]['mother_id']
                    except Exception as e:
                        # any lookup failure is reported as an unknown sample
                        raise xlib.ProgramException(e, 'L002', sample_name)
                    sample_id_list.append(sample_name)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

                # add 1 to the read record and total variant counters
                input_record_counter += 1
                total_variant_counter += 1

                # check only once if the variant has to be traced
                is_traced = variant_id in tvi_list
                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
                    xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF) and alternative alleles (field ALT)
                reference_bases = data_dict['ref']
                alternative_alleles = data_dict['alt']

                # build the alternative alleles list from field ALT
                alternative_allele_list = alternative_alleles.split(',')

                # build the alleles list and set the variant type: N/A, indel
                # (both SAMtools/BCFtools and Freebayes), multiallelic or SNP
                if alternative_alleles == md_symbol:
                    alleles_list = [reference_bases]
                    variant_type = 'N/A'
                else:
                    alleles_list = [reference_bases] + alternative_allele_list
                    if len(reference_bases) > 1 or any(len(allele) > 1 for allele in alternative_allele_list):
                        variant_type = 'INDEL'
                    elif len(alternative_allele_list) > 1:
                        variant_type = 'MULTIALLELIC'
                    else:
                        variant_type = 'SNP'

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of the variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                # build the lists of the left and right side of the genotypes;
                # "/" is tried before "|"
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    (left, sep, right) = sample_gt.partition('/')
                    if not sep:
                        (left, sep, right) = sample_gt.partition('|')
                    if not sep:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(left)
                    sample_gt_right_list.append(right)

                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # set data and insert row into the table "vcf_variants"
                vcf_variants_row_dict['variant_id'] = variant_id
                vcf_variants_row_dict['seq_id'] = data_dict['chrom']
                vcf_variants_row_dict['position'] = data_dict['pos']
                vcf_variants_row_dict['reference_bases'] = reference_bases
                vcf_variants_row_dict['alternative_alleles'] = alternative_alleles
                vcf_variants_row_dict['variant_type'] = variant_type
                xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict)
                vcf_variants_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_alleles"
                vcf_alleles_row_dict['variant_id'] = variant_id
                # reference bases and alternative alleles
                for (j, bases) in enumerate(alleles_list):
                    vcf_alleles_row_dict['allele_id'] = str(j)
                    vcf_alleles_row_dict['bases'] = bases
                    vcf_alleles_row_dict['structure_allele_id'] = get_structure_allele_id(j)
                    xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                    vcf_alleles_inserted_row_counter += 1
                # missing data (its structure identification comes from new_md_id)
                vcf_alleles_row_dict['allele_id'] = md_symbol
                vcf_alleles_row_dict['bases'] = 'N/D'
                vcf_alleles_row_dict['structure_allele_id'] = get_structure_allele_id(new_md_id)
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1
                # imputed missing data
                vcf_alleles_row_dict['allele_id'] = imputed_md_id
                vcf_alleles_row_dict['bases'] = 'N/D'
                vcf_alleles_row_dict['structure_allele_id'] = get_structure_allele_id(imputed_md_id)
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_samples_alleles"
                vcf_samples_alleles_row_dict['variant_id'] = variant_id
                for i in range(sample_number):
                    vcf_samples_alleles_row_dict['sample_id'] = sample_id_list[i]

                    # initialize the genotype distribution dictionary: the keys
                    # are the allele bases and the imputed/missing data alleles
                    genotype_distribution_dict = dict.fromkeys(alleles_list + M_I_list, 0)

                    # calculate the genotype distribution: imputed and missing
                    # data are counted by themselves; any other genotype side
                    # is an index into the alleles list
                    for allele_code in (sample_gt_left_list[i], sample_gt_right_list[i]):
                        if allele_code in M_I_list:
                            genotype_distribution_dict[allele_code] += 1
                        else:
                            genotype_distribution_dict[alleles_list[int(allele_code)]] += 1

                    # calculate frequency and insert rows for reference bases
                    # and alternative alleles (the count is divided by the 2
                    # sides of the genotype)
                    for (j, bases) in enumerate(alleles_list):
                        allele_count = genotype_distribution_dict[bases]
                        if allele_count > 0:
                            vcf_samples_alleles_row_dict['allele_id'] = j
                            vcf_samples_alleles_row_dict['frecuency'] = allele_count / 2
                            xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                            vcf_samples_alleles_inserted_row_counter += 1

                    # calculate frequency and insert rows for imputed missing
                    # data first and missing data afterwards
                    for md_allele in M_I_list:
                        allele_count = genotype_distribution_dict[md_allele]
                        if allele_count > 0:
                            vcf_samples_alleles_row_dict['allele_id'] = md_allele
                            vcf_samples_alleles_row_dict['frecuency'] = allele_count / 2
                            xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                            vcf_samples_alleles_inserted_row_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... {vcf_samples_alleles_inserted_row_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # create the indexes on the tables "vcf_variants", "vcf_alleles" and
    # "vcf_samples_alleles"
    for (table_name, create_index) in (
            ('vcf_variants', xsqlite.create_vcf_variants_index),
            ('vcf_alleles', xsqlite.create_vcf_alleles_index),
            ('vcf_samples_alleles', xsqlite.create_vcf_samples_alleles_index)):
        xlib.Message.print('verbose', f'Creating the index on the table "{table_name}" ...\n')
        create_index(conn)
        xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into NGShelper database
    xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
Пример #3
0
def extract_vcf_genotypes(input_vcf_file, imputed_md_id, output_genotype_file, tvi_list):
    '''
    Extract genotype data of every variant from a VCF file.

    The extraction is done in two passes:
      1. the VCF file is read and one record per genotype and variant
         (seq_id;position;ref;alt;genotype;counter) is written in a temporal
         genotype data file;
      2. the temporal file is read back and one record per variant is written in
         the genotype data file, with every genotype counter divided by the
         number of samples without missing data.

    Parameters:
      input_vcf_file: path of the input VCF file (read with gzip when it ends with ".gz").
      imputed_md_id: genotype allele text that identifies imputed missing data;
        it is counted under the allele code 99.
      output_genotype_file: path of the genotype data file (written with gzip
        when it ends with ".gz").
      tvi_list: identifications ("chrom-pos") of the variants whose trace
        messages are printed.

    Raises:
      xlib.ProgramException: when a file cannot be opened (F001 to F004), the
        subfield GT is not in the field FORMAT (L007) or a sample genotype has
        no separator "/" or "|" (L008).
    '''

    # set temporal genotype data file name
    if output_genotype_file.endswith('.gz'):
        tmp_genotype_file = f'{output_genotype_file[:-3]}.tmp.gz'
    else:
        tmp_genotype_file = f'{output_genotype_file}.tmp'

    # first pass: write the genotype counters per variant in the temporal genotype data file
    # (the "with" statements close the files even if the processing fails)
    with _open_genotype_text_file(input_vcf_file, 'r') as input_vcf_file_id:
        with _open_genotype_text_file(tmp_genotype_file, 'w', newline='\n') as tmp_genotype_file_id:
            (sample_number, maximum_allele_number, last_variant_id) = _write_temporal_genotype_counters(input_vcf_file_id, tmp_genotype_file_id, imputed_md_id, tvi_list)

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(tmp_genotype_file)} is created.')

    # second pass: write the genotype frequencies per variant in the genotype data file
    with _open_genotype_text_file(tmp_genotype_file, 'r', newline='\n') as tmp_genotype_file_id:
        with _open_genotype_text_file(output_genotype_file, 'w', newline='\n') as output_genotype_file_id:
            _write_genotype_frequencies(tmp_genotype_file, tmp_genotype_file_id, output_genotype_file_id, sample_number, maximum_allele_number, last_variant_id, tvi_list)

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(output_genotype_file)} is created.')


def _open_genotype_text_file(file_path, mode, newline=None):
    '''
    Open a text file ("r" or "w" mode, encoding iso-8859-1), using gzip when the
    file name ends with ".gz". Opening errors are raised as xlib.ProgramException
    with the codes F001 (read), F002 (read gzip), F003 (write) or F004 (write gzip).
    '''

    is_gzip_file = file_path.endswith('.gz')
    if mode == 'r':
        error_code = 'F002' if is_gzip_file else 'F001'
    else:
        error_code = 'F004' if is_gzip_file else 'F003'

    try:
        if is_gzip_file:
            return gzip.open(file_path, mode=f'{mode}t', encoding='iso-8859-1', newline=newline)
        return open(file_path, mode=mode, encoding='iso-8859-1', newline=newline)
    except Exception as e:
        raise xlib.ProgramException(e, error_code, file_path)


def _build_genotype_key_list(allele_number):
    '''
    Build the ordered list of genotype keys "j/k" (j <= k) for allele codes from 0
    to allele_number - 1 plus the code 99 used for imputed missing data.
    '''

    genotype_key_list = []
    for j in range(allele_number):
        for k in range(j, allele_number):
            genotype_key_list.append(f'{j}/{k}')
        genotype_key_list.append(f'{j}/99')
    genotype_key_list.append('99/99')
    return genotype_key_list


def _parse_genotype_allele(allele_text, md_symbol, imputed_md_id):
    '''
    Convert a side of a sample genotype: the missing data symbol is kept as is,
    the imputed missing data identification becomes 99 and any other text is
    converted to an integer.
    '''

    if allele_text == md_symbol:
        return md_symbol
    if allele_text == imputed_md_id:
        return 99
    return int(allele_text)


def _write_temporal_genotype_counters(input_vcf_file_id, tmp_genotype_file_id, imputed_md_id, tvi_list):
    '''
    Read the VCF file and write the genotype counters of every variant in the
    temporal genotype data file. Return the tuple (sample number, maximum allele
    number, identification of the last variant read or None if there is none).
    '''

    # initialize the sample number, the maximum allele number and the last variant identification
    sample_number = 0
    maximum_allele_number = 0
    variant_id = None

    # get the missing data symbol once (it is used for every sample of every variant)
    md_symbol = xlib.get_md_symbol()

    # write the header of the temporal genotype data file
    tmp_genotype_file_id.write('seq_id;position;ref;alt;genotype;counter\n')

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # read the first record of input VCF file
    (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # set the sample number (the first nine columns of the record are not samples)
            sample_number = len(data_dict['record_data_list']) - 9

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
            is_traced = variant_id in tvi_list
            if is_traced: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            # get the reference bases (field REF) and alternative alleles (field ALT)
            reference_bases = data_dict['ref']
            alternative_alleles = data_dict['alt']
            if is_traced: xlib.Message.print('trace', f'reference_bases: {reference_bases} - alternative_alleles: {alternative_alleles}')

            # build the alternative alleles list from field ALT (without the first missing data symbol, if any)
            alternative_allele_list = alternative_alleles.split(',')
            if md_symbol in alternative_allele_list:
                alternative_allele_list.remove(md_symbol)

            # set the allele number of the variant and the maximum allele number
            allele_number = 1 + len(alternative_allele_list)
            maximum_allele_number = max(maximum_allele_number, allele_number)

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except ValueError as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of the (left, right) sides of the sample genotypes of a variant
            sample_gt_pair_list = []
            for i in range(sample_number):
                sample_gt = data_dict['sample_list'][i].split(':')[gt_position]
                (left_text, sep, right_text) = sample_gt.partition('/')
                if sep == '':
                    (left_text, sep, right_text) = sample_gt.partition('|')
                if sep == '':
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                sample_gt_pair_list.append((_parse_genotype_allele(left_text, md_symbol, imputed_md_id), _parse_genotype_allele(right_text, md_symbol, imputed_md_id)))

            # initialize genotype counter dictionary
            genotype_counter_dict = dict.fromkeys(_build_genotype_key_list(allele_number), 0)
            if is_traced: xlib.Message.print('trace', f'genotype_counter_dict: {genotype_counter_dict}')

            # initialize missing data counter
            md_counter = 0

            # count genotypes (a genotype with missing data in any side is counted as missing data)
            for (left_allele, right_allele) in sample_gt_pair_list:
                if left_allele == md_symbol or right_allele == md_symbol:
                    md_counter += 1
                else:
                    # the genotype key always has the lower allele code on the left
                    if left_allele <= right_allele:
                        (j, k) = (left_allele, right_allele)
                    else:
                        (j, k) = (right_allele, left_allele)
                    genotype_counter_dict[f'{j}/{k}'] += 1
            if is_traced: xlib.Message.print('trace', f'genotype_counter_dict: {genotype_counter_dict}')

            # write the variant genotype count records
            record_prefix = f'{data_dict["chrom"]};{data_dict["pos"]};{reference_bases};{alternative_alleles}'
            for (genotype, counter) in genotype_counter_dict.items():
                tmp_genotype_file_id.write(f'{record_prefix};{genotype};{counter}\n')
            tmp_genotype_file_id.write(f'{record_prefix};{md_symbol};{md_counter}\n')

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    return (sample_number, maximum_allele_number, variant_id)


def _write_genotype_frequencies(tmp_genotype_file, tmp_genotype_file_id, output_genotype_file_id, sample_number, maximum_allele_number, last_variant_id, tvi_list):
    '''
    Read the temporal genotype data file and write one record per variant in the
    genotype data file with the genotype frequencies and the missing data counter.
    '''

    # initialize record counters
    # NOTE(review): this counter is never increased, so the record reader always receives 0 -- confirm what it is used for
    input_record_counter = 0

    # write the header of the genotype data file
    genotype_key_list = _build_genotype_key_list(maximum_allele_number)
    maximum_variant_list = genotype_key_list + ['.']
    if last_variant_id is not None and last_variant_id in tvi_list: xlib.Message.print('trace', f'maximum_variant_list: {maximum_variant_list}')
    output_genotype_file_id.write('seq_id;position;ref;alt;{0}\n'.format(';'.join(maximum_variant_list)))

    # read the first record of the temporal genotype data file
    (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

    # set the first record control variable
    first_record = True

    # while there are records in the temporal genotype data file
    while record != '':

        # the header record
        if first_record:

            # set the first record control variable
            first_record = False

            # read the next record of the temporal genotype data file
            (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

        # data records
        else:

            # save old values
            old_seq_id = data_dict['seq_id']
            old_position = data_dict['position']
            old_ref = data_dict['ref']
            old_alt = data_dict['alt']
            is_traced = f'{old_seq_id}-{old_position}' in tvi_list

            # initialize genotype counter dictionary
            genotype_counter_dict = dict.fromkeys(maximum_variant_list, 0)
            if is_traced: xlib.Message.print('trace', f'***genotype_counter_dict: {genotype_counter_dict}')

            # while there are records in the temporal genotype data file and the same variant
            while record != '' and data_dict['seq_id'] == old_seq_id and data_dict['position'] == old_position:

                # save the genotype counter in the genotype counter dictionary
                genotype_counter_dict[data_dict['genotype']] = data_dict['counter']

                # read the next record of the temporal genotype data file
                (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

            if is_traced: xlib.Message.print('trace', f'***genotype_counter_dict: {genotype_counter_dict}')

            # write the variant genotype frequency record
            # NOTE(review): when every sample has missing data, only the missing data counter is written and
            # the record has fewer columns than the header -- confirm this is intended
            md_counter = genotype_counter_dict['.']
            valid_sample_number = sample_number - int(md_counter)
            genotype_counter_list = []
            if valid_sample_number > 0:
                genotype_counter_list = [str(int(genotype_counter_dict[genotype_key]) / valid_sample_number) for genotype_key in genotype_key_list]
            # the counter is converted to text because "join" only accepts strings
            genotype_counter_list.append(str(md_counter))
            genotype_counter_list_text = ';'.join(genotype_counter_list)
            output_genotype_file_id.write(f'{old_seq_id};{old_position};{old_ref};{old_alt};{genotype_counter_list_text}\n')
Пример #4
0
def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id,
                           output_dir, variant_number_per_file,
                           allele_transformation, tvi_list):
    '''
    Build the SimHyb allele frequency files of two species from a VCF file.

    The allele frequencies of every variant are calculated separately for the
    species 1 and the species 2, using only the samples whose mother
    identification is "NONE" (adults). The frequencies are written in one or more
    CSV files in the output directory, with up to variant_number_per_file
    variants per file.

    Parameters:
      vcf_file: path of the input VCF file (read with gzip when it ends with ".gz").
      sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data to
        get the species and mother identification of every sample; sp1_id and
        sp2_id also select the samples of each species.
      output_dir: directory where the SimHyb files are written.
      variant_number_per_file: maximum number of variants per SimHyb file.
      allele_transformation: "ADD100" adds 100 to the integer alleles, "ATCG"
        replaces the allele by its base code (A -> 1; T -> 2; C -> 3; G -> 4);
        with any other value the allele is written as is.
      tvi_list: identifications ("chrom-pos") of the variants whose trace
        messages are printed.

    Raises:
      xlib.ProgramException: when a file cannot be opened (F001 to F003), a
        sample of the VCF file is not in the sample data (L002), the VCF file has
        no samples (L003), the subfield GT is not in the field FORMAT (L007), a
        sample genotype has no separator "/" or "|" (L008) or an allele is not a
        single base A, T, C or G (L016).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize ATCG conversion dictionaries
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg_code_dict = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
    atcg_conversion_dict = {}

    # get the missing data symbol once (it is used for every sample of every variant)
    md_symbol = xlib.get_md_symbol()

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the "with" statement closes the VCF file even if the processing fails
    with vcf_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample species and mother identification lists per variant
                # (the first nine columns of the record are not samples)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                        mother_id = sample_dict[sample_id]['mother_id']
                    except KeyError as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                # (there is no original exception here, so an empty text is passed)
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant record
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
                    xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF)
                reference_bases = data_dict['ref']

                # build the alternative alleles list from field ALT
                alternative_allele_list = data_dict['alt'].split(',')

                # build ATCG conversion list (index 0 is the reference, index n is the n-th alternative allele);
                # a dictionary is used so that only single bases are accepted: a substring search in "ATCG"
                # would also accept texts such as "AT", "TC", "CG" or the empty text
                atcg_conversion_list = []
                for allele_bases in [reference_bases] + alternative_allele_list:
                    atcg_code = atcg_code_dict.get(allele_bases.upper())
                    if atcg_code is None:
                        raise xlib.ProgramException('', 'L016')
                    atcg_conversion_list.append(atcg_code)
                atcg_conversion_dict[total_variant_counter] = atcg_conversion_list

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt_list.append(sample_data_list[gt_position])

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    (left_allele, sep, right_allele) = sample_gt.partition('/')
                    if sep == '':
                        (left_allele, sep, right_allele) = sample_gt.partition('|')
                    if sep == '':
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(left_allele)
                    sample_gt_right_list.append(right_allele)

                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # get the allele counters per species
                allele_counter_dict_1 = {}
                allele_counter_dict_2 = {}
                allele_counter_dict_h = {}
                for i in range(sample_number):

                    # only when the sample is an adult
                    if mother_id_list[i] != 'NONE':
                        continue

                    # select the allele counter dictionary of the sample species
                    if species_id_list[i] == sp1_id:
                        allele_counter_dict = allele_counter_dict_1
                    elif species_id_list[i] == sp2_id:
                        allele_counter_dict = allele_counter_dict_2
                    else:
                        allele_counter_dict = allele_counter_dict_h

                    # count both sides of the genotype, except the missing data
                    for allele in (sample_gt_left_list[i], sample_gt_right_list[i]):
                        if allele != md_symbol:
                            allele_counter_dict[allele] = allele_counter_dict.get(allele, 0) + 1

                if is_traced:
                    xlib.Message.print('trace', f'allele_counter_dict_1: {allele_counter_dict_1}')
                    xlib.Message.print('trace', f'allele_counter_dict_2: {allele_counter_dict_2}')
                    xlib.Message.print('trace', f'allele_counter_dict_h: {allele_counter_dict_h}')

                # calculate the maximum allele number
                maximum_allele_number = max(maximum_allele_number, len(allele_counter_dict_1), len(allele_counter_dict_2))

                # calculate the variant allele frequencies per species
                sp1_allele_total = sum(allele_counter_dict_1.values())
                allele_frequency_dict_1[total_variant_counter] = {}
                for (allele, allele_counter) in allele_counter_dict_1.items():
                    allele_frequency_dict_1[total_variant_counter][allele] = allele_counter / sp1_allele_total
                    if is_traced:
                        xlib.Message.print('trace', f'allele_frequency_dict_1[{total_variant_counter}][{allele}]: {allele_frequency_dict_1[total_variant_counter][allele]}')
                sp2_allele_total = sum(allele_counter_dict_2.values())
                allele_frequency_dict_2[total_variant_counter] = {}
                for (allele, allele_counter) in allele_counter_dict_2.items():
                    allele_frequency_dict_2[total_variant_counter][allele] = allele_counter / sp2_allele_total
                    if is_traced:
                        xlib.Message.print('trace', f'allele_frequency_dict_2[{total_variant_counter}][{allele}]: {allele_frequency_dict_2[total_variant_counter][allele]}')

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter / variant_number_per_file)

    # get the VCF file name without directory and extensions (it is the base of the SimHyb file names)
    vcf_base_name = os.path.basename(vcf_file[:-3] if vcf_file.endswith('.gz') else vcf_file)
    file_name = os.path.splitext(vcf_base_name)[0]

    # initialize the begin and end variant
    begin_variant = 1
    end_variant = min(variant_number_per_file, total_variant_counter)

    # write the variant allele frequencies per species in the output SimHyb files
    for file_index in range(simhyb_file_num):

        xlib.Message.print('trace', f'\n\n\n\nbegin_variant: {begin_variant} - end_variant: {end_variant}')

        # set the SimHyb file name
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file (its name always ends with ".csv", so gzip is never needed)
        try:
            current_simhyb_file_id = open(current_simhyb_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # the "with" statement closes the SimHyb file even if the writing fails
        with current_simhyb_file_id:

            # write allele frequency records (one record per allele ordinal)
            for allele_index in range(maximum_allele_number):

                xlib.Message.print('trace', f'i: {allele_index}')

                # initialize the list of the "allele;frequency" parts of the record
                record_part_list = []

                # species 1
                for variant_number in range(begin_variant, end_variant + 1):

                    xlib.Message.print('trace', f'j: {variant_number}')

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_1.get(variant_number, {})

                    xlib.Message.print('trace', f'variant_data_dict: {variant_data_dict}')

                    (allele, allele_frequency) = _get_simhyb_allele_data(variant_data_dict, allele_index, allele_transformation, atcg_conversion_dict, variant_number)

                    # add the part of this record corresponding with the variant
                    record_part_list.append(f'{allele};{allele_frequency}')

                # species 2
                for variant_number in range(begin_variant, end_variant + 1):

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_2.get(variant_number, {})
                    (allele, allele_frequency) = _get_simhyb_allele_data(variant_data_dict, allele_index, allele_transformation, atcg_conversion_dict, variant_number)

                    # add the part of this record corresponding with the variant
                    record_part_list.append(f'{allele};{allele_frequency}')

                # write the record
                current_simhyb_file_id.write(';'.join(record_part_list) + '\n')

        # print OK message
        xlib.Message.print('info', f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.')

        # set the new begin and end variant
        begin_variant = end_variant + 1
        end_variant = min(begin_variant + variant_number_per_file - 1, total_variant_counter)


def _get_simhyb_allele_data(variant_data_dict, allele_index, allele_transformation, atcg_conversion_dict, variant_number):
    '''
    Get the tuple (allele, frequency) of the allele in the position allele_index of
    the sorted alleles of a variant, applying the allele transformation. The tuple
    (0, 0) is returned when the variant has no allele in that position.
    '''

    allele_list = sorted(variant_data_dict)
    if allele_index >= len(allele_list):
        return (0, 0)

    allele = allele_list[allele_index]
    allele_frequency = variant_data_dict[allele]
    if allele_transformation == 'ADD100' and xlib.check_int(allele):
        allele = int(allele) + 100
    elif allele_transformation == 'ATCG':
        allele = atcg_conversion_dict[variant_number][int(allele)]

    return (allele, allele_frequency)
Пример #5
0
def _encode_phase_allele(allele, multiallelic_status, allele_transformation):
    '''
    Return the text written in a PHASE input file for one side of a sample genotype.

    allele: left or right side of the genotype (text taken from the subfield GT).
    multiallelic_status: 'M' when the variant has more than two distinct
        non-missing alleles, 'S' otherwise.
    allele_transformation: when it is 'ADD100', 100 is added to integer alleles.
    '''

    # missing data are written as "?" in "S" variants and as "-1" in "M" variants
    if allele == '.':
        if multiallelic_status == 'S':
            return '?'
        if multiallelic_status == 'M':
            return '-1'

    # add 100 to the integer alleles when the transformation ADD100 is requested
    if xlib.check_int(allele) and allele_transformation == 'ADD100':
        return str(int(allele) + 100)

    # otherwise the allele is written as it is
    return allele


def _write_phase_sequence_file(seq_output_converted_file, sample_number,
                               sample_info_list, variant_position_list,
                               variant_multiallelic_status_list,
                               gt_left_matrix, gt_right_matrix,
                               allele_transformation):
    '''
    Write the PHASE input file corresponding to the variants of one sequence.

    The matrices have a row per variant and a column per sample. The file has
    four header records (sample number, variant number, positions and
    multiallelic status) and three records per sample (identification, left
    alleles and right alleles).

    Raises xlib.ProgramException (F003) if the file can not be opened.
    '''

    # open the output converted file (its name always ends with ".txt", so it is never a gzip file)
    try:
        seq_output_converted_file_id = open(seq_output_converted_file,
                                            mode='w',
                                            encoding='iso-8859-1',
                                            newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', seq_output_converted_file)

    # the context manager closes the file even if a write or a conversion fails
    with seq_output_converted_file_id:

        # write header records
        variant_position_list_text = ' '.join(variant_position_list)
        variant_multiallelic_status_list_text = ''.join(
            variant_multiallelic_status_list)
        seq_output_converted_file_id.write(f'{sample_number}\n')
        seq_output_converted_file_id.write(f'{len(variant_position_list)}\n')
        seq_output_converted_file_id.write(f'P {variant_position_list_text}\n')
        seq_output_converted_file_id.write(
            f'{variant_multiallelic_status_list_text}\n')

        # write sample records
        for i in range(sample_number):

            # build left and right side lists of variants of a sample
            sample_variant_gt_left_list = []
            sample_variant_gt_right_list = []
            for gt_left_row, gt_right_row, multiallelic_status in zip(
                    gt_left_matrix, gt_right_matrix,
                    variant_multiallelic_status_list):
                sample_variant_gt_left_list.append(
                    _encode_phase_allele(gt_left_row[i], multiallelic_status,
                                         allele_transformation))
                sample_variant_gt_right_list.append(
                    _encode_phase_allele(gt_right_row[i], multiallelic_status,
                                         allele_transformation))

            # write the three records of the sample: identification, left alleles and right alleles
            sample_variant_gt_left_list_text = ' '.join(
                sample_variant_gt_left_list)
            sample_variant_gt_right_list_text = ' '.join(
                sample_variant_gt_right_list)
            seq_output_converted_file_id.write(f'#{sample_info_list[i][0]}\n')
            seq_output_converted_file_id.write(
                f'{sample_variant_gt_left_list_text}\n')
            seq_output_converted_file_id.write(
                f'{sample_variant_gt_right_list_text}\n')


def convert_vcf_to_phase_input(vcf_file, sample_file, sp1_id, sp2_id,
                               hybrid_id, imputed_md_id, allele_transformation,
                               output_dir, tvi_list):
    '''
    Convert a VCF file to the PHASE input format.

    One output file named "<output_dir>/<VCF base name>-2phase-<sequence>.txt"
    is written per run of consecutive variant records sharing the same CHROM
    value. The files are opened in write mode, so if a sequence identification
    appears again later in the VCF file, its previous file is overwritten.

    vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
    sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data to
        get the species of every sample.
    imputed_md_id: not used by this function.
    allele_transformation: when it is 'ADD100', 100 is added to integer alleles.
    output_dir: directory where the converted files are written.
    tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises xlib.ProgramException when a file can not be opened (F001, F002,
    F003), a sample is not found in the sample data (L002), there are no
    samples (L003), the subfield GT is not found (L007) or a genotype has no
    "/" or "|" separator (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species identification list per variant
    species_id_list = []

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the VCF file is closed in the "finally" clause even if the conversion fails
    try:

        # get the VCF file name without extensions; it is the prefix of the output file names
        if vcf_file.endswith('.gz'):
            file_name = os.path.splitext(os.path.basename(vcf_file[:-3]))[0]
        else:
            file_name = os.path.splitext(os.path.basename(vcf_file))[0]

        # initialize counters
        seq_counter = 0
        variant_counter = 0
        record_counter = 0

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample information list and the sample species list
                # (the sample identifications are in the columns following the first nine)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])
                    species_id_list.append(species_id)

                # check if the sample species list is empty
                if species_id_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # add 1 to the sequence counter
                seq_counter += 1

                # initialize the counter of variants of the sequence
                variant_counter = 0

                # save the sequence
                old_seq = data_dict['chrom']

                # initialize the list of variant positions
                variant_position_list = []

                # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
                gt_left_matrix = []
                gt_right_matrix = []

                # initialize the list of the variant multiallelic status
                variant_multiallelic_status_list = []

                # process variant records of the same sequence
                while record != '' and not record.startswith(
                    ('##', '#CHROM')) and data_dict['chrom'] == old_seq:

                    # set the identification of the current variant; it has to be set for
                    # every record so that the trace corresponds to the variant being processed
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    is_traced = variant_id in tvi_list

                    # add 1 to the VCF record counter
                    record_counter += 1

                    # add 1 to the counter of variants of the sequence
                    variant_counter += 1

                    # append position to the list of variant positions
                    variant_position_list.append(data_dict['pos'])

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(
                        ':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L007', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = [
                        data_dict['sample_list'][i].split(':')[gt_position]
                        for i in range(sample_number)
                    ]
                    if is_traced:
                        xlib.Message.print(
                            'trace', f'sample_gt_list: {sample_gt_list}')

                    # build the lists of the left and right side of sample genotypes of a variant
                    sample_gt_left_list = []
                    sample_gt_right_list = []
                    for sample_gt in sample_gt_list:
                        # the sides are separated by "/" or, if it is not found, by "|"
                        sep_pos = sample_gt.find('/')
                        if sep_pos == -1:
                            sep_pos = sample_gt.find('|')
                        if sep_pos == -1:
                            raise xlib.ProgramException(
                                '', 'L008', 'GT', data_dict['chrom'],
                                data_dict['pos'])
                        sample_gt_left_list.append(sample_gt[:sep_pos])
                        sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                    # count the occurrences of every allele of the variant, skipping the missing data
                    allele_counter_dict = {}
                    for allele_left, allele_right in zip(
                            sample_gt_left_list, sample_gt_right_list):
                        for allele in (allele_left, allele_right):
                            if allele != xlib.get_md_symbol():
                                allele_counter_dict[
                                    allele] = allele_counter_dict.get(
                                        allele, 0) + 1
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'allele_counter_dict: {allele_counter_dict}')

                    # check if the variant is multiallelic (more than two distinct alleles)
                    if len(allele_counter_dict) > 2:
                        variant_multiallelic_status = 'M'
                    else:
                        variant_multiallelic_status = 'S'
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'variant_multiallelic_status: {variant_multiallelic_status}.'
                        )

                    # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
                    gt_left_matrix.append(sample_gt_left_list)
                    gt_right_matrix.append(sample_gt_right_list)

                    # append to the list of the variant multiallelic status
                    variant_multiallelic_status_list.append(
                        variant_multiallelic_status)

                    # print the counters
                    xlib.Message.print(
                        'verbose',
                        f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                    )

                    # read the next record of the VCF file
                    (record, _,
                     data_dict) = xlib.read_vcf_file(vcf_file_id,
                                                     sample_number)

                # set output converted file of the sequence
                seq_output_converted_file = f'{output_dir}/{file_name}-2phase-{old_seq}.txt'

                # write the output converted file of the sequence
                _write_phase_sequence_file(seq_output_converted_file,
                                           sample_number, sample_info_list,
                                           variant_position_list,
                                           variant_multiallelic_status_list,
                                           gt_left_matrix, gt_right_matrix,
                                           allele_transformation)

                xlib.Message.print('verbose', '\n')

                # print OK message
                xlib.Message.print(
                    'info',
                    f'The converted file {os.path.basename(seq_output_converted_file)} is created.'
                )

    finally:

        # close VCF file
        vcf_file_id.close()
Пример #6
0
def _encode_structure_allele(allele, allele_transformation):
    '''
    Return the text written in the Structure file for one side of a sample genotype.

    When allele_transformation is 'ADD100', 100 is added to integer alleles;
    otherwise the allele is returned as it is.
    '''

    # NOTE(review): the missing data identifications replacing the VCF missing data symbol
    # are also increased by 100 if xlib.check_int accepts them -- confirm this is intended
    if xlib.check_int(allele) and allele_transformation == 'ADD100':
        return str(int(allele) + 100)
    return allele


def convert_vcf_to_structure(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id,
                             imputed_md_id, new_md_id, allele_transformation,
                             structure_file_type, output_converted_file,
                             tvi_list):
    '''
    Convert a VCF file to the Structure input formats.

    The output file is tab-separated: a header record with the variant
    identifications ("<chrom>-<pos>") and two records per sample (left and
    right sides of its genotypes), each one starting with the sample
    identification and its numeric species identification (1: sp1_id;
    2: sp2_id; 3: any other species).

    vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
    sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data to
        get the species of every sample.
    imputed_md_id: genotype side value that causes a variant to be excluded
        when structure_file_type is '1'.
    new_md_id: value written instead of the missing data symbol of the VCF file.
    allele_transformation: when it is 'ADD100', 100 is added to integer alleles.
    structure_file_type: when it is '1', variants with any side equal to
        imputed_md_id are not written.
    output_converted_file: path of the output file (written with gzip if it
        ends with ".gz").
    tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises xlib.ProgramException when a file can not be opened (F001, F002,
    F003, F004), a sample is not found in the sample data (L002), there are no
    samples (L003), the subfield GT is not found (L007) or a genotype can not
    be split by "/" or "|" (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # initialize the variant code list
    variant_code_list = []

    # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
    gt_left_matrix = []
    gt_right_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the VCF file is closed in the "finally" clause even if the reading fails
    try:

        # initialize counters
        record_counter = 0
        variant_counter = 0

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample information list
                # (the sample identifications are in the columns following the first nine)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])

                # check if the sample information list is empty
                if sample_info_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(sample_info_list)

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the VCF record counter
                record_counter += 1

                # add 1 to the variant counter
                variant_counter += 1

                # append the variant identification to the variant code list
                variant_code_list.append(variant_id)

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [
                    data_dict['sample_list'][i].split(':')[gt_position]
                    for i in range(sample_number)
                ]
                if is_traced:
                    xlib.Message.print(
                        'trace', f'(4) sample_gt_list: {sample_gt_list}')

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    # the sides are separated by "/" or, if it is not found, by "|"
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                    # the missing data symbol of the VCF file is replaced by the new missing data identification
                    try:
                        allele_left = sample_gt[:sep_pos]
                        if allele_left == xlib.get_md_symbol():
                            sample_gt_left_list.append(new_md_id)
                        else:
                            sample_gt_left_list.append(allele_left)
                        allele_right = sample_gt[sep_pos + 1:]
                        if allele_right == xlib.get_md_symbol():
                            sample_gt_right_list.append(new_md_id)
                        else:
                            sample_gt_right_list.append(allele_right)
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
                gt_left_matrix.append(sample_gt_left_list)
                gt_right_matrix.append(sample_gt_right_list)

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    finally:

        # close the VCF file
        vcf_file_id.close()

    # review the imputed missing data when the type of the converted file is 1
    if structure_file_type == '1':

        # detect variants with any imputed missing data
        excluded_variant_index_list = [
            i for i in range(len(gt_left_matrix))
            if any(gt_left_matrix[i][j] == imputed_md_id
                   or gt_right_matrix[i][j] == imputed_md_id
                   for j in range(sample_number))
        ]
        xlib.Message.print(
            'trace',
            f'excluded_variant_index_list: {excluded_variant_index_list}')

        # remove data of variants with any imputed missing data
        # (a single filtering pass instead of repeated pops from the middle of the lists)
        excluded_variant_index_set = set(excluded_variant_index_list)
        variant_code_list = [
            variant_code for i, variant_code in enumerate(variant_code_list)
            if i not in excluded_variant_index_set
        ]
        gt_left_matrix = [
            gt_left_row for i, gt_left_row in enumerate(gt_left_matrix)
            if i not in excluded_variant_index_set
        ]
        gt_right_matrix = [
            gt_right_row for i, gt_right_row in enumerate(gt_right_matrix)
            if i not in excluded_variant_index_set
        ]

    # open the output converted file
    if output_converted_file.endswith('.gz'):
        try:
            output_converted_file_id = gzip.open(output_converted_file,
                                                 mode='wt',
                                                 encoding='iso-8859-1',
                                                 newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_converted_file)
    else:
        try:
            output_converted_file_id = open(output_converted_file,
                                            mode='w',
                                            encoding='iso-8859-1',
                                            newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_converted_file)

    # the context manager closes the file even if a write or a conversion fails
    with output_converted_file_id:

        # write header record
        variant_code_list_text = '\t'.join(variant_code_list)
        output_converted_file_id.write(
            f'sample_id\tspecies_id\t{variant_code_list_text}\n')

        # write sample records
        for i in range(sample_number):

            # build left and right side lists of variants of a sample
            sample_variant_gt_left_list = []
            sample_variant_gt_right_list = []
            for gt_left_row, gt_right_row in zip(gt_left_matrix,
                                                 gt_right_matrix):
                sample_variant_gt_left_list.append(
                    _encode_structure_allele(gt_left_row[i],
                                             allele_transformation))
                sample_variant_gt_right_list.append(
                    _encode_structure_allele(gt_right_row[i],
                                             allele_transformation))

            # write the first record of the sample (left sides)
            sample_variant_gt_left_list_text = '\t'.join(
                sample_variant_gt_left_list)
            output_converted_file_id.write(
                f'{sample_info_list[i][0]}\t{sample_info_list[i][1]}\t{sample_variant_gt_left_list_text}\n'
            )

            # write the second record of the sample (right sides)
            sample_variant_gt_right_list_text = '\t'.join(
                sample_variant_gt_right_list)
            output_converted_file_id.write(
                f'{sample_info_list[i][0]}\t{sample_info_list[i][1]}\t{sample_variant_gt_right_list_text}\n'
            )

    # print OK message
    xlib.Message.print(
        'info',
        f'The converted file {os.path.basename(output_converted_file)} is created.'
    )
Пример #7
0
def filter_variant(input_vcf_file, value, output_purged_file):
    '''
    Filter variants containing a determined value in left or right sides of sample genotypes in a VCF file.
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the non-filtered sequence identification list
    non_filtered_seq_id_list = []

    # set the temporal VCF file
    temporal_vcf_file = f'{output_purged_file}.tmp'

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file,
                                          mode='rt',
                                          encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file,
                                     mode='r',
                                     encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the temporal VCF file
    if temporal_vcf_file.endswith('.gz'):
        try:
            temporal_vcf_file_id = gzip.open(temporal_vcf_file,
                                             mode='wt',
                                             encoding='iso-8859-1',
                                             newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', temporal_vcf_file)
    else:
        try:
            temporal_vcf_file_id = open(temporal_vcf_file,
                                        mode='w',
                                        encoding='iso-8859-1',
                                        newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', temporal_vcf_file)

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    filtered_variant_counter = 0

    # read the first record of input VCF file
    (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                  sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # write the metadata record
            temporal_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # set the sample number
            sample_number = len(record_data_list) - 9

            # write the column description record
            temporal_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith(
                '##') and not record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            # get the reference bases (field REF) and alternative alleles (field ALT)
            reference_bases = data_dict['ref']
            alternative_alleles = data_dict['alt']

            # build the alternative alleles list from field ALT
            alternative_allele_list = data_dict['alt'].split(',')

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT',
                                            data_dict['chrom'],
                                            data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = []
            for i in range(sample_number):
                sample_data_list = data_dict['sample_list'][i].split(':')
                sample_gt_list.append(sample_data_list[gt_position])

            # build the lists of the left and right side of sample genotypes of a variant
            sample_gt_left_list = []
            sample_sep_list = []
            sample_gt_right_list = []
            for i in range(sample_number):
                sep = '/'
                sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    sep = '|'
                    sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])
                sample_sep_list.append(sep)
                sample_gt_left_list.append(sample_gt_list[i][:sep_pos])
                sample_gt_right_list.append(sample_gt_list[i][sep_pos + 1:])

            # initialize the control variable to write the variant
            write_the_variant = True

            # detect value in left or right sides of sample genotypes
            for i in range(sample_number):
                if sample_gt_left_list[i] == value or sample_gt_right_list[
                        i] == value:
                    write_the_variant = False
                    break

            # if the process has to write the variant
            if write_the_variant:

                # rebuild the list of the field GT for every sample
                for i in range(sample_number):
                    sample_gt_list[
                        i] = f'{sample_gt_left_list[i]}{sample_sep_list[i]}{sample_gt_right_list[i]}'

                # rebuild the alternative alleles and its corresponding record data
                alternative_alleles = ','.join(alternative_allele_list)

                # rebuild the sample genotype data list and their corresponding record data
                sample_list = []
                for i in range(sample_number):
                    sample_data_list[gt_position] = sample_gt_list[i]
                    sample_list.append(':'.join(sample_data_list))

                # add the sequence identification to the non filtered sequence identification list
                if data_dict['chrom'] not in non_filtered_seq_id_list:
                    non_filtered_seq_id_list.append(data_dict['chrom'])

                # write the variant record
                sample_list_text = '\t'.join(sample_list)
                temporal_vcf_file_id.write(
                    f'{data_dict["chrom"]}\t{data_dict["pos"]}\t{data_dict["id"]}\t{reference_bases}\t{alternative_alleles}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n'
                )

            # if the process does not have to write the variant
            else:

                # add 1 to the filtered variant counter
                filtered_variant_counter += 1

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # close files
    input_vcf_file_id.close()
    temporal_vcf_file_id.close()

    # print OK message
    xlib.Message.print(
        'info',
        f'The temporal file {os.path.basename(temporal_vcf_file)} containing the filtered variants is created.'
    )
    xlib.Message.print('info', 'Removing metadata of filtered variants ...')

    # open the temporal VCF file
    if temporal_vcf_file.endswith('.gz'):
        try:
            temporal_vcf_file_id = gzip.open(temporal_vcf_file,
                                             mode='rt',
                                             encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', temporal_vcf_file)
    else:
        try:
            temporal_vcf_file_id = open(temporal_vcf_file,
                                        mode='r',
                                        encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', temporal_vcf_file)

    # open the output purged file
    if output_purged_file.endswith('.gz'):
        try:
            output_purged_file_id = gzip.open(output_purged_file,
                                              mode='wt',
                                              encoding='iso-8859-1',
                                              newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_purged_file)
    else:
        try:
            output_purged_file_id = open(output_purged_file,
                                         mode='w',
                                         encoding='iso-8859-1',
                                         newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_purged_file)

    # read the first record of temporal VCF file
    record = temporal_vcf_file_id.readline()

    # while there are records in temporal VCF file
    while record != '':

        # process contig records
        if record.startswith('##contig'):

            # get the sequence identification and the position
            seq_id = ''
            i1 = 13
            i2 = record.find(',', i1)
            if i2 > -1:
                seq_id = record[i1:i2]

            # write the record when the sequence identification was not filtered
            if seq_id in non_filtered_seq_id_list:
                output_purged_file_id.write(record)

        # process other records
        else:

            # write record
            output_purged_file_id.write(record)

        # read the next record
        record = temporal_vcf_file_id.readline()

    # close files
    temporal_vcf_file_id.close()
    output_purged_file_id.close()

    # print OK message
    xlib.Message.print(
        'info',
        f'The purged file {os.path.basename(output_purged_file)} is created.')

    # delete temporal VCF file
    os.remove(temporal_vcf_file)
    xlib.Message.print(
        'info',
        f'The temporal VCF file {os.path.basename(temporal_vcf_file)} is deleted.'
    )
Пример #8
0
def change_value(input_vcf_file, value, new_value, output_purged_file):
    '''
    Change a value in left and right sides of sample genotypes by a new value in a VCF file.

    Metadata records and the column description record are copied to the output
    file as they are read. In every variant record, each side of the genotype
    (subfield GT) of every sample that is equal to "value" is replaced by
    "new_value"; the remaining subfields of each sample are written as read.

    Parameters:
        input_vcf_file: path of the input VCF file (opened with gzip if it ends with ".gz").
        value: text of the genotype side to be replaced.
        new_value: text that replaces "value".
        output_purged_file: path of the output VCF file (opened with gzip if it ends with ".gz").

    Raises:
        xlib.ProgramException: if a file cannot be opened (F001 to F004), the
            field FORMAT of a variant has no subfield GT (L007) or a sample
            genotype has neither "/" nor "|" (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    changed_data = 0

    def print_counters():
        '''
        Print the current value of the counters (the message starts with a carriage return).
        '''
        xlib.Message.print(
            'verbose',
            f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Changed data ... {changed_data:8d}'
        )

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file,
                                          mode='rt',
                                          encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file,
                                     mode='r',
                                     encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # the "with" statements close the files even if the process is interrupted by an exception
    with input_vcf_file_id:

        # open the output purged file
        if output_purged_file.endswith('.gz'):
            try:
                output_purged_file_id = gzip.open(output_purged_file,
                                                  mode='wt',
                                                  encoding='iso-8859-1',
                                                  newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', output_purged_file)
        else:
            try:
                output_purged_file_id = open(output_purged_file,
                                             mode='w',
                                             encoding='iso-8859-1',
                                             newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', output_purged_file)

        with output_purged_file_id:

            # read the first record of input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                        sample_number)

            # while there are records in input VCF file
            while record != '':

                # process metadata records
                while record != '' and record.startswith('##'):

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # write the metadata record
                    output_purged_file_id.write(record)

                    # print the counters
                    print_counters()

                    # read the next record of the input VCF file
                    (record, _, data_dict) = xlib.read_vcf_file(
                        input_vcf_file_id, sample_number)

                # process the column description record
                if record.startswith('#CHROM'):

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # set the sample number (the first 9 columns are not sample columns)
                    sample_number = len(data_dict['record_data_list']) - 9

                    # write the column description record
                    output_purged_file_id.write(record)

                    # print the counters
                    print_counters()

                    # read the next record of the input VCF file
                    (record, _, data_dict) = xlib.read_vcf_file(
                        input_vcf_file_id, sample_number)

                # process variant records
                while record != '' and not record.startswith(
                        '##') and not record.startswith('#CHROM'):

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # add 1 to the total variant counter
                    total_variant_counter += 1

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(
                        ':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L007', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # rebuild the data of every sample changing the value in both sides of its genotype
                    sample_list = []
                    for i in range(sample_number):

                        # split the subfields of this sample: only the subfield GT is
                        # rewritten, the other subfields have to remain those of this sample
                        sample_data_list = data_dict['sample_list'][i].split(
                            ':')
                        sample_gt = sample_data_list[gt_position]

                        # find the separator of the genotype sides ("/" is looked for before "|")
                        sep = '/'
                        sep_pos = sample_gt.find(sep)
                        if sep_pos == -1:
                            sep = '|'
                            sep_pos = sample_gt.find(sep)
                        if sep_pos == -1:
                            raise xlib.ProgramException(
                                '', 'L008', 'GT', data_dict['chrom'],
                                data_dict['pos'])
                        gt_left = sample_gt[:sep_pos]
                        gt_right = sample_gt[sep_pos + 1:]

                        # change the value in left and right sides of the genotype
                        if gt_left == value:
                            gt_left = new_value
                            changed_data += 1
                        if gt_right == value:
                            gt_right = new_value
                            changed_data += 1

                        # rebuild the sample data with the new genotype
                        sample_data_list[
                            gt_position] = f'{gt_left}{sep}{gt_right}'
                        sample_list.append(':'.join(sample_data_list))

                    # write the variant record (the fields before the samples are not modified)
                    sample_list_text = '\t'.join(sample_list)
                    output_purged_file_id.write(
                        f'{data_dict["chrom"]}\t{data_dict["pos"]}\t{data_dict["id"]}\t{data_dict["ref"]}\t{data_dict["alt"]}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n'
                    )

                    # print the counters
                    print_counters()

                    # read the next record of the input VCF file
                    (record, _, data_dict) = xlib.read_vcf_file(
                        input_vcf_file_id, sample_number)

            xlib.Message.print('verbose', '\n')

    # print OK message
    xlib.Message.print(
        'info',
        f'The purged file {os.path.basename(output_purged_file)} is created.')
Пример #9
0
def build_haplotype(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id,
                    hybrid_id, haplotype_file, tvi_list):
    '''
    Builds the haplotype of a sample set from a VCF file.

    Consecutive variant records with the same key (the key returned by
    xlib.read_vcf_file) are grouped in a sequence. Every variant contributes
    one symbol per sample:
        - "_" if the variant is an indel (REF or some ALT allele longer than 1);
        - "N" if some genotype side is equal to xlib.get_md_symbol();
        - "U" if some genotype side is equal to "imputed_md_id";
        - the allele if both genotype sides point to the same allele;
        - the symbol returned by xlib.get_nucleotide_list_symbol otherwise.
    The symbols of a sequence are joined with "-" and written in the output
    file as FASTA records identified by "<sequence id>-<sample id>".

    Parameters:
        input_vcf_file: path of the input VCF file (opened with gzip if it ends with ".gz").
        sample_file: path of the sample file passed to xlib.get_sample_data.
        imputed_md_id: genotype side text that identifies imputed missing data.
        sp1_id: identification of the first species.
        sp2_id: identification of the second species.
        hybrid_id: identification of the hybrids (passed to xlib.get_sample_data).
        haplotype_file: path of the output file (opened with gzip if it ends with ".gz").
        tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises:
        xlib.ProgramException: if a file cannot be opened (F001 to F004), a
            sample of the VCF file is not in the sample file (L002), the VCF
            file has no samples (L003), the field FORMAT has no subfield GT
            (L007), a genotype is wrong (L008) or there is no symbol for a
            pair of different alleles (D004).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    total_seq_counter = 0

    def print_counters():
        '''
        Print the current value of the counters (the message starts with a carriage return).
        '''
        xlib.Message.print(
            'verbose',
            f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Total seqs ... {total_seq_counter:8d}'
        )

    # initialize the sample information list
    sample_info_list = []

    # initialize the sequence code list
    seq_code_list = []

    # initialize the haplotype matrix (rows: sequences; columns: samples)
    haplotype_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # open the VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file,
                                          mode='rt',
                                          encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file,
                                     mode='r',
                                     encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # the "with" statement closes the VCF file even if the process is interrupted by an exception
    with input_vcf_file_id:

        # read the first record of the VCF file
        (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                      sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                print_counters()

                # read the next record of the VCF file
                (record, key,
                 data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                 sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample information list (the first 9 columns are not sample columns)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])

                # check if the sample information list is empty
                if sample_info_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(sample_info_list)

                # print the counters
                print_counters()

                # read the next record of the VCF file
                (record, key,
                 data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                 sample_number)

            # process variant records
            while record != '' and not record.startswith(
                    '##') and not record.startswith('#CHROM'):

                # add 1 to the total sequence counter
                total_seq_counter += 1

                # set the old key
                old_key = key

                # append sequence identification to the sequence code list
                seq_code_list.append(data_dict['chrom'])

                # initialize the sequence haplotype list
                seq_haplotype_list = []

                # process the variant records of the same sequence (records with the same key)
                while record != '' and not record.startswith(
                        '##') and not record.startswith(
                            '#CHROM') and old_key == key:

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # add 1 to the total variant counter
                    total_variant_counter += 1

                    # set the variant identification; it is set for every variant so that
                    # the trace is controlled by the current variant, not by the first one of the sequence
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    is_traced = variant_id in tvi_list

                    # get the reference bases (field REF)
                    reference_bases = data_dict['ref']

                    # build the alternative alleles list from field ALT
                    alternative_allele_list = data_dict['alt'].split(',')

                    # check if the variant is an indel (to SAMtools/BCFtools and Freebayes)
                    is_indel = len(reference_bases) > 1 or any(
                        len(alternative_allele) > 1
                        for alternative_allele in alternative_allele_list)

                    # trace the variant data
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                        )
                        xlib.Message.print(
                            'trace', f'reference_bases: {reference_bases}')
                        xlib.Message.print(
                            'trace',
                            f'alternative_allele_list: {alternative_allele_list}'
                        )
                        xlib.Message.print('trace', f'INDEL?: {is_indel}')

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(
                        ':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L007', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = [
                        data_dict['sample_list'][i].split(':')[gt_position]
                        for i in range(sample_number)
                    ]

                    # build the sample nucleotide list of a variant
                    sample_nucleotide_list = []
                    for sample_gt in sample_gt_list:

                        # if the variant is an INDEL
                        if is_indel:
                            nucleotide = '_'

                        # if the variant is not an INDEL
                        else:

                            # find the separator of the genotype sides ("/" is looked for before "|")
                            sep_pos = sample_gt.find('/')
                            if sep_pos == -1:
                                sep_pos = sample_gt.find('|')
                            if sep_pos == -1:
                                # the first argument is the originating exception ('' if there is
                                # none), as in the other calls; it was missing here, so the
                                # error code was passed in the wrong position
                                raise xlib.ProgramException(
                                    '', 'L008', 'GT', data_dict['chrom'],
                                    data_dict['pos'])
                            gt_left = sample_gt[:sep_pos]
                            gt_right = sample_gt[sep_pos + 1:]

                            md_symbol = xlib.get_md_symbol()
                            if gt_left == md_symbol or gt_right == md_symbol:
                                nucleotide = 'N'
                            elif gt_left == imputed_md_id or gt_right == imputed_md_id:
                                nucleotide = 'U'
                            else:

                                # translate the allele numbers into alleles (0: REF; n: n-th ALT allele);
                                # any failure is reported as a wrong genotype
                                try:
                                    left_number = int(gt_left)
                                    right_number = int(gt_right)
                                    if left_number == 0:
                                        left_nucleotide = reference_bases
                                    else:
                                        left_nucleotide = alternative_allele_list[
                                            left_number - 1]
                                    if right_number == 0:
                                        right_nucleotide = reference_bases
                                    else:
                                        right_nucleotide = alternative_allele_list[
                                            right_number - 1]
                                    if left_nucleotide == right_nucleotide:
                                        nucleotide = right_nucleotide
                                    else:
                                        nucleotide = xlib.get_nucleotide_list_symbol(
                                            [left_nucleotide, right_nucleotide])
                                except Exception as e:
                                    raise xlib.ProgramException(
                                        e, 'L008', 'GT', data_dict['chrom'],
                                        data_dict['pos'])

                                # this check is outside the "try" so that the error D004 is not
                                # caught by the handler above and reported as L008
                                if left_nucleotide != right_nucleotide and nucleotide == '':
                                    raise xlib.ProgramException(
                                        '', 'D004', 'GT', data_dict['chrom'],
                                        data_dict['pos'])

                        # append nucleotide to the sample nucleotide list of a variant
                        sample_nucleotide_list.append(nucleotide)

                    # concat sample nucleotide list of a variant to sequence haplotype list
                    if seq_haplotype_list == []:
                        seq_haplotype_list = sample_nucleotide_list
                    else:
                        for i in range(sample_number):
                            seq_haplotype_list[
                                i] += f'-{sample_nucleotide_list[i]}'

                    # print the counters
                    print_counters()

                    # read the next record of VCF file
                    (record, key,
                     data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                     sample_number)

                # append a row to haplotype matrix (rows: sequences; columns: samples)
                haplotype_matrix.append(seq_haplotype_list)

        xlib.Message.print('verbose', '\n')

    # open the output haplotype file
    if haplotype_file.endswith('.gz'):
        try:
            haplotype_file_id = gzip.open(haplotype_file,
                                          mode='wt',
                                          encoding='iso-8859-1',
                                          newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', haplotype_file)
    else:
        try:
            haplotype_file_id = open(haplotype_file,
                                     mode='w',
                                     encoding='iso-8859-1',
                                     newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', haplotype_file)

    # write FASTA sequences per sequence and sample
    with haplotype_file_id:
        for i in range(total_seq_counter):
            for j in range(sample_number):

                # write haplotype identification record
                haplotype_file_id.write(
                    f'>{seq_code_list[i]}-{sample_info_list[j][0]}\n')

                # write haplotype sequence record
                haplotype_file_id.write(f'{haplotype_matrix[i][j]}\n')

    # print OK message
    xlib.Message.print(
        'info',
        f'The converted file {os.path.basename(haplotype_file)} is created.')
Пример #10
0
def extract_ff_features(input_gff_file, gff_format, vcf_file, output_gff_file):
    '''
    Extract genomic features from a GFF file corresponding to the variant of a VCF file.

    The positions of the variants of the VCF file are loaded per sequence
    identification. Every data record of the GFF file (record not starting
    with "#") whose interval [start, end] contains at least one variant
    position of its sequence is written in the output file followed by two
    additional columns: the found positions separated by "," and a fragment
    identification "<sequence id up to the first dot>_<found positions separated by "-">".

    Parameters:
        input_gff_file: path of the input GFF file (opened with gzip if it ends with ".gz").
        gff_format: format of the GFF file (it is not used by this function).
        vcf_file: path of the VCF file (opened with gzip if it ends with ".gz").
        output_gff_file: path of the output file (opened with gzip if it ends with ".gz").

    Raises:
        xlib.ProgramException: if a file cannot be opened (F001 to F004), the
            position of a variant is not an integer number (L005) or a GFF
            data record has not integer start and end columns (F009).
    '''

    # initialize the variant dictionary (key: sequence identification; value: list of variant positions)
    variant_dict = {}

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # initialize counters
    record_counter = 0
    variant_counter = 0

    # the "with" statement closes the VCF file even if the process is interrupted by an exception
    with vcf_file_id:

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

            # process variant records
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # add 1 to the variant counter
                variant_counter += 1

                # add the sequence and position to the variant dictionary
                try:
                    pos = int(data_dict['pos'])
                except Exception as e:
                    raise xlib.ProgramException(e, 'L005', data_dict['chrom'], data_dict['pos'])
                variant_dict.setdefault(data_dict['chrom'], []).append(pos)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

        xlib.Message.print('verbose', '\n')

    # open the input GFF file
    if input_gff_file.endswith('.gz'):
        try:
            input_gff_file_id = gzip.open(input_gff_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_gff_file)
    else:
        try:
            input_gff_file_id = open(input_gff_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_gff_file)

    # the "with" statements close the GFF files even if the process is interrupted by an exception
    with input_gff_file_id:

        # open the output GFF file
        if output_gff_file.endswith('.gz'):
            try:
                output_gff_file_id = gzip.open(output_gff_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', output_gff_file)
        else:
            try:
                output_gff_file_id = open(output_gff_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', output_gff_file)

        with output_gff_file_id:

            # initialize counters
            input_record_counter = 0
            output_record_counter = 0

            # read the first record
            record = input_gff_file_id.readline()

            # while there are records
            while record != '':

                # add 1 to input record counter
                input_record_counter += 1

                # process data records
                if not record.startswith('#'):

                    # extract data
                    # record format: seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
                    data_list = [field.strip() for field in record.split('\t')]
                    try:
                        seq_id = data_list[0]
                        start = int(data_list[3])
                        end = int(data_list[4])
                    except Exception as e:
                        # the record reported is the GFF record being processed, not the last VCF record
                        raise xlib.ProgramException(e, 'F009', os.path.basename(input_gff_file), input_record_counter)

                    # get the variant positions of the sequence that are inside the feature
                    # (they keep the order in which they were read from the VCF file)
                    found_position_list = [str(position) for position in variant_dict.get(seq_id, []) if start <= position <= end]

                    # if the feature has variants, write in the output file
                    if found_position_list:

                        # the sequence identification is cut at the first dot; when there is
                        # no dot the whole identification is used (slicing with the -1 returned
                        # by find() would drop its last character)
                        seq_id_prefix = seq_id.split('.', 1)[0]
                        fragment_id = f'{seq_id_prefix}_{"-".join(found_position_list)}'
                        output_record = f'{record.strip()}\t{",".join(found_position_list)}\t{fragment_id}\n'
                        output_gff_file_id.write(output_record)
                        output_record_counter += 1

                # print record counter
                xlib.Message.print('verbose', f'\rGFF file: {input_record_counter} processed records - {output_record_counter} selected records.')

                # read the next record
                record = input_gff_file_id.readline()

            xlib.Message.print('verbose', '\n')
Пример #11
0
def impute_adults(input_vcf_file, sample_file, fix, scenario,
                  min_aa_percentage, min_md_imputation_percentage,
                  imputed_md_id, sp1_id, sp1_max_md_percentage, sp2_id,
                  sp2_max_md_percentage, hybrid_id, min_afr_percentage,
                  min_depth, output_vcf_file, tvi_list):
    '''
    Filter and fixes variant data of a VCF file.
    '''

    # initialize the sample number
    sample_number = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # calculate the adult individual number of both species and hybrids
    adult_num_1 = 0
    adult_num_2 = 0
    adult_num_h = 0
    for key, value in sample_dict.items():
        if value['mother_id'] == 'NONE':
            if value['species_id'] == sp1_id:
                adult_num_1 += 1
            elif value['species_id'] == sp2_id:
                adult_num_2 += 1
            else:
                adult_num_h += 1
    xlib.Message.print(
        'verbose',
        f'{sp1_id} adults: {adult_num_1} - {sp2_id} adults: {adult_num_2} - hybrid adults: {adult_num_h}\n'
    )

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the non-filtered sequence identification list
    non_filtered_seq_id_list = []

    # set the temporal VCF file
    temporal_vcf_file = f'{output_vcf_file}.tmp'

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file,
                                          mode='rt',
                                          encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file,
                                     mode='r',
                                     encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the temporal VCF file
    if temporal_vcf_file.endswith('.gz'):
        try:
            temporal_vcf_file_id = gzip.open(temporal_vcf_file,
                                             mode='wt',
                                             encoding='iso-8859-1',
                                             newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', temporal_vcf_file)
    else:
        try:
            temporal_vcf_file_id = open(temporal_vcf_file,
                                        mode='w',
                                        encoding='iso-8859-1',
                                        newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', temporal_vcf_file)

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    filtered_variant_counter = 0

    # read the first record of input VCF file
    (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id,
                                                  sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # write the metadata record
            temporal_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample species and mother identification lists per variant
            for i in range(9, len(record_data_list)):
                try:
                    species_id = sample_dict[record_data_list[i]]['species_id']
                    mother_id = sample_dict[record_data_list[i]]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', record_data_list[i])
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            if species_id_list == []:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # write the column description record
            temporal_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith(
                '##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                )

            # get the reference bases (field REF) and alternative alleles (field ALT)
            reference_bases = data_dict['ref']
            alternative_alleles = data_dict['alt']

            # build the alternative alleles list from field ALT
            alternative_allele_list = data_dict['alt'].split(',')

            # check if the variant is an indel (both SAMtools/BCFtools and Freebayes)
            is_indel = False
            if len(reference_bases) > 1:
                is_indel = True
            else:
                for alternative_allele in alternative_allele_list:
                    if len(alternative_allele) > 1:
                        is_indel = True
                        break
            if variant_id in tvi_list:
                xlib.Message.print('trace', f'(1) INDEL?: {is_indel}')

            # get the combined depth across samples (subfield DP) from field INFO
            info_field_list = data_dict['info'].upper().split(';')
            dp = -1
            for i in range(len(info_field_list)):
                if info_field_list[i].startswith('DP='):
                    try:
                        dp = int(info_field_list[i][3:])
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L008', 'DP',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                    break
            if dp == -1:
                raise xlib.ProgramException('', 'L007', 'DP',
                                            data_dict['chrom'],
                                            data_dict['pos'])

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT',
                                            data_dict['chrom'],
                                            data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = []
            for i in range(sample_number):
                sample_data_list = data_dict['sample_list'][i].split(':')
                sample_gt_list.append(sample_data_list[gt_position])

            # build the lists of the left and right side of sample genotypes of a variant
            sample_gt_left_list = []
            sample_sep_list = []
            sample_gt_right_list = []
            for i in range(sample_number):
                sep = '/'
                sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    sep = '|'
                    sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])
                sample_sep_list.append(sep)
                if sample_gt_list[i] not in xlib.get_md_code_list():
                    try:
                        sample_gt_left_list.append(
                            int(sample_gt_list[i][:sep_pos]))
                        sample_gt_right_list.append(
                            int(sample_gt_list[i][sep_pos + 1:]))
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                else:
                    sample_gt_left_list.append(-1)
                    sample_gt_right_list.append(-1)

            if variant_id in tvi_list:
                xlib.Message.print('trace',
                                   f'(2) reference_bases: {reference_bases}')
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(3) alternative_allele_list: {alternative_allele_list}')
            if variant_id in tvi_list:
                xlib.Message.print('trace',
                                   f'(4) sample_gt_list: {sample_gt_list}')

            # fix the reference base(s) when there is no individual with this reference
            if fix.upper() == 'Y':

                # check if there are samples with 0/n or 0|n in their field GT
                found_0_n = False
                for i in range(sample_number):
                    if sample_gt_left_list[i] == 0 or sample_gt_right_list[
                            i] == 0:
                        found_0_n = True
                        break

                # if there is not any sample with 0/n or 0|n in its field GT
                if not found_0_n:

                    # change the reference_base(s) and alternative alleles
                    reference_bases = alternative_allele_list[0]
                    alternative_allele_list = alternative_allele_list[1:]
                    alternative_allele_list = [
                        xlib.get_md_symbol()
                    ] if alternative_allele_list == [] else alternative_allele_list
                    if variant_id in tvi_list:
                        xlib.Message.print(
                            'trace',
                            '(5) 0 is not found, the reference_bases and alternative_allele_list have been changed.'
                        )

                    # fix the field GT of every sample
                    for i in range(sample_number):
                        if sample_gt_left_list[i] >= 1:
                            sample_gt_left_list[i] -= 1
                            sample_gt_right_list[i] -= 1

            # calculate the alternative allele counter per allele (2 or higher) and species and their percentages
            aa_counter_list_1 = []
            aa_counter_list_2 = []
            aa_counter_list_h = []
            for _ in range(len(alternative_allele_list)):
                aa_counter_list_1.append(0)
                aa_counter_list_2.append(0)
                aa_counter_list_h.append(0)
            for i in range(sample_number):
                if mother_id_list[i] == 'NONE':
                    if sample_gt_right_list[i] >= 2:
                        if species_id_list[i] == sp1_id:
                            aa_counter_list_1[sample_gt_right_list[i] - 1] += 1
                        elif species_id_list[i] == sp2_id:
                            aa_counter_list_2[sample_gt_right_list[i] - 1] += 1
                        else:
                            aa_counter_list_h[sample_gt_right_list[i] - 1] += 1
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(6) aa_counter_list_1: {aa_counter_list_1} - aa_counter_list_2 {aa_counter_list_2} - aa_counter_list_h: {aa_counter_list_h}'
                )
            aa_percentage_list_1 = []
            aa_percentage_list_2 = []
            aa_percentage_list_h = []
            for i in range(len(alternative_allele_list)):
                aa_percentage_list_1.append(aa_counter_list_1[i] /
                                            adult_num_1 * 100)
                aa_percentage_list_2.append(aa_counter_list_2[i] /
                                            adult_num_2 * 100)
                aa_percentage_list_h.append(aa_counter_list_h[i] /
                                            adult_num_h *
                                            100) if hybrid_id != 'NONE' else 0
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(7) aa_percentage_list_1: {aa_percentage_list_1} - aa_percentage_list_2 {aa_percentage_list_2} - aa_percentage_list_h: {aa_percentage_list_h}'
                )

            # when the variant is not an indel, set missing data in the GT field of every sample whose alternative allele (2 or higher) has a percentage less than the minimum percentage in the species of that sample
            if not is_indel:

                for i in range(sample_number):
                    if sample_gt_right_list[i] >= 2:
                        if (species_id_list[i] == sp1_id and aa_percentage_list_1[sample_gt_right_list[i] - 1] < min_aa_percentage) or \
                            (species_id_list[i] == sp2_id and aa_percentage_list_2[sample_gt_right_list[i] - 1] < min_aa_percentage) or \
                            (species_id_list[i] == hybrid_id and aa_percentage_list_h[sample_gt_right_list[i] - 1] < min_aa_percentage):
                            # set missing data
                            if variant_id in tvi_list:
                                xlib.Message.print(
                                    'trace',
                                    f'(8) Setting missing data in i: {i} - sample_gt_left_list[i]: {sample_gt_left_list[i]} - sample_gt_right_list[i]: {sample_gt_right_list[i]}'
                                )
                            sample_gt_left_list[i] = -1
                            sample_gt_right_list[i] = -1

            # fix the alternative allele list when an alternative allele does not have any sample
            alternative_allele_counter_list = []
            for _ in range(len(alternative_allele_list)):
                alternative_allele_counter_list.append(0)
            for i in range(sample_number):
                if sample_gt_left_list[i] > 0:
                    alternative_allele_counter_list[sample_gt_left_list[i] -
                                                    1] += 1
                if sample_gt_right_list[i] > 0:
                    alternative_allele_counter_list[sample_gt_right_list[i] -
                                                    1] += 1
            for i in range(len(alternative_allele_counter_list) - 1, -1, -1):
                if alternative_allele_counter_list[i] == 0:
                    del alternative_allele_list[i]
                if alternative_allele_list == []:
                    alternative_allele_list = [xlib.get_md_symbol()]
                else:
                    alternative_allele_list
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(9) alternative_allele_counter_list: {alternative_allele_counter_list}'
                )

            # calculate the missing data counter per species and their percentages
            md_counter_1 = 0
            md_counter_2 = 0
            md_counter_h = 0
            for i in range(sample_number):
                if mother_id_list[i] == 'NONE':
                    if sample_gt_right_list[i] == -1:
                        if species_id_list[i] == sp1_id:
                            md_counter_1 += 1
                        elif species_id_list[i] == sp2_id:
                            md_counter_2 += 1
                        else:
                            md_counter_h += 1
            md_percentage_1 = md_counter_1 / adult_num_1 * 100
            md_percentage_2 = md_counter_2 / adult_num_2 * 100
            md_percentage_h = md_counter_h / adult_num_h * 100
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(10) {sp1_id} missing data: {md_percentage_1:5.2f}% - {sp2_id} missing data: {md_percentage_2:5.2f}% - {hybrid_id} missing data: {md_percentage_h:5.2f}%'
                )

            # when the sample is an adult individual with missing data, mark its GT field for imputation (code 99, later written as the imputed missing data identification) if the missing data percentage of its species is greater than the minimum percentage for missing data imputation; a hybrid sample is marked if the missing data percentage of either species is greater than that minimum
            for i in range(sample_number):

                # only when the sample is an adult individual
                if mother_id_list[i] == 'NONE':

                    if sample_gt_right_list[i] == -1 and \
                      (species_id_list[i] == sp1_id and md_percentage_1 > min_md_imputation_percentage or \
                       species_id_list[i] == sp2_id and md_percentage_2 > min_md_imputation_percentage or \
                       species_id_list[i] == hybrid_id and (md_percentage_1 > min_md_imputation_percentage or md_percentage_2 > min_md_imputation_percentage)):

                        sample_gt_left_list[i] = 99
                        sample_gt_right_list[i] = 99

            # get a list with the new order of the alternative alleles
            new_order_list = []
            order = 1
            for i in range(len(alternative_allele_counter_list)):
                if alternative_allele_counter_list[i] > 0:
                    new_order_list.append(order)
                    order += 1
                else:
                    new_order_list.append(0)
            if variant_id in tvi_list:
                xlib.Message.print('trace',
                                   f'(11) new_order_list: {new_order_list}')

            # check if all samples are monomorphic
            monomorphic = True
            left_allele = None
            right_allele = None
            for i in range(sample_number):
                if mother_id_list[i] == 'NONE':
                    if sample_gt_right_list[i] == 99:
                        monomorphic = False
                        break
                    elif sample_gt_right_list[i] != -1:
                        if left_allele == None:
                            left_allele = sample_gt_left_list[i]
                        if right_allele == None:
                            right_allele = sample_gt_right_list[i]
                        if left_allele != sample_gt_left_list[
                                i] or right_allele != sample_gt_right_list[i]:
                            monomorphic = False
                            break
            if variant_id in tvi_list:
                xlib.Message.print('trace', f'(12) monomorphic: {monomorphic}')

            if variant_id in tvi_list:
                literal = ' '
                for i in range(sample_number):
                    literal += f'{str(sample_gt_left_list[i])}{sample_sep_list[i]}{str(sample_gt_right_list[i])} '
                xlib.Message.print(
                    'trace',
                    f'(13) genotype list before imputation revision: {literal}'
                )

            # review depending on the scenario
            for i in range(sample_number):

                # only when the sample is an adult individual
                if mother_id_list[i] == 'NONE':

                    # revision when the scenario is '0' (no imputation) or '2' (maximum possible imputation)
                    if scenario in ['0', '2']:

                        # the sample is hybrid
                        if species_id_list[i] == hybrid_id and (
                                md_percentage_1 > min_md_imputation_percentage
                                or
                                md_percentage_2 > min_md_imputation_percentage
                        ) and sample_gt_left_list[i] == sample_gt_right_list[i]:
                            sample_gt_right_list[i] = 99
                        elif species_id_list[i] == sp1_id and (
                                md_percentage_1 > min_md_imputation_percentage
                        ) and sample_gt_left_list[i] == sample_gt_right_list[i]:
                            sample_gt_right_list[i] = 99
                        elif species_id_list[i] == sp2_id and (
                                md_percentage_2 > min_md_imputation_percentage
                        ) and sample_gt_left_list[i] == sample_gt_right_list[i]:
                            sample_gt_right_list[i] = 99

                    # revision when the scenario is '1' (standard)
                    elif scenario == '1':

                        #if sample_gt_right_list[i] == -1:
                        #    sample_gt_left_list[i] = 99
                        #    sample_gt_right_list[i] = 99
                        pass

                    # revision when the scenario is '3' (maximum possible missing data)
                    elif scenario == '3':

                        if sample_gt_left_list[i] == sample_gt_right_list[i]:
                            sample_gt_right_list[i] = -1

            if variant_id in tvi_list:
                literal = ' '
                for i in range(sample_number):
                    literal += f'{str(sample_gt_left_list[i])}{sample_sep_list[i]}{str(sample_gt_right_list[i])} '
                xlib.Message.print(
                    'trace',
                    f'(14)  genotype list after imputation revision: {literal}'
                )

            # rebuild the list of the field GT for every sample
            for i in range(sample_number):
                if sample_gt_left_list[i] == -1:
                    left = xlib.get_md_symbol()
                elif sample_gt_left_list[i] == 99:
                    left = imputed_md_id
                else:
                    left = new_order_list[
                        sample_gt_left_list[i] -
                        1] if sample_gt_left_list[i] > 0 else 0
                if sample_gt_right_list[i] == -1:
                    right = xlib.get_md_symbol()
                elif sample_gt_right_list[i] == 99:
                    right = imputed_md_id
                else:
                    right = new_order_list[
                        sample_gt_right_list[i] -
                        1] if sample_gt_right_list[i] > 0 else 0
                sample_gt_left_list[i] = left
                sample_gt_right_list[i] = right
                sample_gt_list[
                    i] = f'{sample_gt_left_list[i]}{sample_sep_list[i]}{sample_gt_right_list[i]}'

            # rebuild the alternative alleles and its corresponding record data
            alternative_alleles = ','.join(alternative_allele_list)

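            # rebuild the sample genotype data list and their corresponding record data
            # NOTE(review): sample_data_list still holds the subfields of the last sample split when the genotype list was built, so the non-GT subfields of every sample are taken from that last sample -- confirm whether data_dict['sample_list'][i] should be split again here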
            sample_list = []
            for i in range(sample_number):
                sample_data_list[gt_position] = sample_gt_list[i]
                sample_list.append(':'.join(sample_data_list))

            if variant_id in tvi_list:
                xlib.Message.print('trace',
                                   f'(15) reference_bases: {reference_bases}')
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(16) alternative_allele_list: {alternative_allele_list}')
            if variant_id in tvi_list:
                xlib.Message.print('trace',
                                   f'(17) sample_gt_list: {sample_gt_list}')

            # check the allele frequencies when the variant is not an indel
            allele_frequency_OK = True
            if not is_indel:

                # get the allele counters per species
                allele_counter_dict_1 = {}
                allele_counter_dict_2 = {}
                allele_counter_dict_h = {}
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE':
                        if sample_gt_right_list[i] != xlib.get_md_symbol():
                            if species_id_list[i] == sp1_id:
                                allele_counter_dict_1[sample_gt_left_list[
                                    i]] = allele_counter_dict_1.get(
                                        sample_gt_left_list[i], 0) + 1
                                allele_counter_dict_1[sample_gt_right_list[
                                    i]] = allele_counter_dict_1.get(
                                        sample_gt_right_list[i], 0) + 1
                            elif species_id_list[i] == sp2_id:
                                allele_counter_dict_2[sample_gt_left_list[
                                    i]] = allele_counter_dict_2.get(
                                        sample_gt_left_list[i], 0) + 1
                                allele_counter_dict_2[sample_gt_right_list[
                                    i]] = allele_counter_dict_2.get(
                                        sample_gt_right_list[i], 0) + 1
                            else:
                                allele_counter_dict_h[sample_gt_left_list[
                                    i]] = allele_counter_dict_h.get(
                                        sample_gt_left_list[i], 0) + 1
                                allele_counter_dict_h[sample_gt_right_list[
                                    i]] = allele_counter_dict_h.get(
                                        sample_gt_right_list[i], 0) + 1
                if variant_id in tvi_list:
                    xlib.Message.print(
                        'trace',
                        f'(18) allele_counter_dict_1: {allele_counter_dict_1}')
                if variant_id in tvi_list:
                    xlib.Message.print(
                        'trace',
                        f'(19) allele_counter_dict_2: {allele_counter_dict_2}')
                if variant_id in tvi_list:
                    xlib.Message.print(
                        'trace',
                        f'(20) allele_counter_dict_h: {allele_counter_dict_h}')

                # check the allele frequencies per species
                if imputed_md_id in allele_counter_dict_1.keys() and len(allele_counter_dict_1.keys()) > 3 or \
                    imputed_md_id not in allele_counter_dict_1.keys() and len(allele_counter_dict_1.keys()) > 2 or \
                    imputed_md_id in allele_counter_dict_2.keys() and len(allele_counter_dict_2.keys()) > 3 or \
                    imputed_md_id not in allele_counter_dict_2.keys() and len(allele_counter_dict_2.keys()) > 2:
                    allele_frequency_OK = False
                    if variant_id in tvi_list:
                        xlib.Message.print('trace',
                                           '(21) multiallelic variant.')
                else:
                    sp1_allele_total = 0
                    for allele in allele_counter_dict_1.keys():
                        sp1_allele_total += allele_counter_dict_1[allele]
                    for allele in allele_counter_dict_1.keys():
                        allele_frequency = allele_counter_dict_1[
                            allele] / sp1_allele_total * 100
                        if allele_frequency < min_afr_percentage:
                            allele_frequency_OK = False
                            if variant_id in tvi_list:
                                xlib.Message.print(
                                    'trace',
                                    f'(20) allele {allele} in species 1 has a frequency {allele_frequency:5.2f}% less than maf'
                                )
                    sp2_allele_total = 0
                    for allele in allele_counter_dict_2.keys():
                        sp2_allele_total += allele_counter_dict_2[allele]
                    for allele in allele_counter_dict_2.keys():
                        allele_frequency = allele_counter_dict_2[
                            allele] / sp2_allele_total * 100
                        if allele_counter_dict_2[
                                allele] / sp2_allele_total * 100 < min_afr_percentage:
                            allele_frequency_OK = False
                            if variant_id in tvi_list:
                                xlib.Message.print(
                                    'trace',
                                    f'(21) allele {allele} in species 2 has a frequency {allele_frequency:5.2f}% less than maf'
                                )

            # check if there are imputations in adult individuals when the scenario is 0 (no imputation)
            scenario0_are_there_imputations = False
            if scenario == '0':
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE' and (
                            sample_gt_left_list[i] == imputed_md_id
                            or sample_gt_right_list[i] == imputed_md_id):
                        scenario0_are_there_imputations = True
                        break

            # the variant is filtered if DP is less than the minimum combined depth, or all adult samples are monomorphic, or the missing data percentage is greater than the maximum missing data percentage in both species, or the allele frequency is not OK, or there are imputations in adult individuals when the scenario is 0
            if variant_id in tvi_list:
                xlib.Message.print(
                    'trace',
                    f'(22) dp: {dp} - md_percentage_1: {md_percentage_1:5.2f}% - md_percentage_2: {md_percentage_2:5.2f}% - allele_frequency_OK: {allele_frequency_OK}'
                )
            if dp < min_depth or monomorphic or (
                    md_percentage_1 > sp1_max_md_percentage
                    and md_percentage_2 > sp2_max_md_percentage
            ) or not allele_frequency_OK or scenario0_are_there_imputations:

                # add 1 to the filtered variant counter
                filtered_variant_counter += 1
                if variant_id in tvi_list:
                    xlib.Message.print('trace',
                                       '(23) This variant is deleted!!!')

            # in any other case
            else:

                # add the sequence identification to the non filtered sequence identification list
                if data_dict['chrom'] not in non_filtered_seq_id_list:
                    non_filtered_seq_id_list.append(data_dict['chrom'])

                # write the variant record
                sample_list_text = '\t'.join(sample_list)
                temporal_vcf_file_id.write(
                    f'{data_dict["chrom"]}\t{data_dict["pos"]}\t{data_dict["id"]}\t{reference_bases}\t{alternative_alleles}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n'
                )

            # print the counters
            xlib.Message.print(
                'verbose',
                f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}'
            )

            # read the next record of the input VCF file
            (record, key,
             data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # close files
    input_vcf_file_id.close()
    temporal_vcf_file_id.close()

    # print OK message
    xlib.Message.print(
        'info',
        f'The temporal file {os.path.basename(temporal_vcf_file)} containing the filtered variants is created.'
    )
    xlib.Message.print('info', 'Removing metadata of filtered variants ...')

    # open the temporal VCF file
    if temporal_vcf_file.endswith('.gz'):
        try:
            temporal_vcf_file_id = gzip.open(temporal_vcf_file,
                                             mode='rt',
                                             encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', temporal_vcf_file)
    else:
        try:
            temporal_vcf_file_id = open(temporal_vcf_file,
                                        mode='r',
                                        encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', temporal_vcf_file)

    # open the output VCF file
    if output_vcf_file.endswith('.gz'):
        try:
            output_vcf_file_id = gzip.open(output_vcf_file,
                                           mode='wt',
                                           encoding='iso-8859-1',
                                           newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_vcf_file)
    else:
        try:
            output_vcf_file_id = open(output_vcf_file,
                                      mode='w',
                                      encoding='iso-8859-1',
                                      newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_vcf_file)

    # read the first record of temporal VCF file
    record = temporal_vcf_file_id.readline()

    # while there are records in temporal VCF file
    while record != '':

        # process contig records
        if record.startswith('##contig'):

            # get the sequence identification (the text from index 13 up to the first comma of the contig record)
            seq_id = ''
            i1 = 13
            i2 = record.find(',', i1)
            if i2 > -1:
                seq_id = record[i1:i2]

            # write the record when the sequence identification was not filtered
            if seq_id in non_filtered_seq_id_list:
                output_vcf_file_id.write(record)

        # process other records
        else:

            # write record
            output_vcf_file_id.write(record)

        # read the next record
        record = temporal_vcf_file_id.readline()

    # close files
    temporal_vcf_file_id.close()
    output_vcf_file_id.close()

    # print OK message
    xlib.Message.print(
        'info',
        f'The file {os.path.basename(output_vcf_file)} containing the filtered variants is created.'
    )

    # delete temporal VCF file
    os.remove(temporal_vcf_file)
    xlib.Message.print(
        'info',
        f'The temporal VCF file {os.path.basename(temporal_vcf_file)} is deleted.'
    )