Example #1
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile, which doesn't enforce VCF format checks
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warning(
                        "Removing locus {0}:{1} where ref and alt alleles are the same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf +
                                                      '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
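
The VCF examples here depend on an extract_file helper that is not part of the snippet. A minimal sketch of what it might look like, assuming the archive is a tarball and vcf/vcf_index are member paths inside it (the tar assumption and the error message are mine, not from the original source):

import shutil
import tarfile


def extract_file(archive, member, dest):
    # Hypothetical helper: copies a single member of a tar archive to dest.
    # The original implementation is not shown and may handle other formats.
    with tarfile.open(archive) as tar:
        src = tar.extractfile(member)
        if src is None:
            raise ValueError('{0} not found in {1}'.format(member, archive))
        with open(dest, 'wb') as out:
            shutil.copyfileobj(src, out)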
Example #2
def process_bedpe(archive, bedpe, bedpe_index, output_prefix):
    """
    Extracts and processes the brass bedpe file.
    """
    out_raw_bedpe = '{0}.tmp.bedpe.gz'.format(output_prefix)
    logger.info("Extracting raw bedpe to tmp file {0}".format(out_raw_bedpe))
    extract_file(archive, bedpe, out_raw_bedpe)

    out_raw_bedpe_index = '{0}.tmp.bedpe.gz.tbi'.format(output_prefix)
    logger.info("Extracting raw bedpe index to tmp file {0}".format(out_raw_bedpe_index))
    extract_file(archive, bedpe_index, out_raw_bedpe_index)

    out_formatted_bedpe = '{0}.bedpe.gz'.format(output_prefix)
    logger.info("Creating final bedpe {0}".format(out_formatted_bedpe))
    writer = pysam.BGZFile(out_formatted_bedpe, mode='wb')
    reader = pysam.BGZFile(out_raw_bedpe, mode='rb')
    try:
        meta_line = None
        process_header = False
        hdr = []
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('#'):
                meta_line = line
            else:
                if not process_header:
                    hdr = format_header(meta_line)
                    assert len(hdr) == len(set(hdr)), \
                        "Duplicate header keys {0}".format(','.join(hdr))
                    writer.write(('#' + '\t'.join(hdr) + '\n').encode('utf-8'))
                    process_header = True

                dat = dict(zip(hdr, line.rstrip('\r\n').split('\t')))
                dat['brass_notation'] = dat['brass_notation'].replace('Chr.chr', 'chr')
                new_line = "\t".join([dat[i] for i in hdr]) + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final bedpe index {0}".format(out_formatted_bedpe + '.tbi'))
    pysam.tabix_index(out_formatted_bedpe, preset='bed', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_bedpe)
    os.remove(out_raw_bedpe_index)
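
format_header is likewise external to this snippet. It receives the last '#'-prefixed line of the BRASS bedpe (held in meta_line) and must return the list of column keys, including brass_notation. A plausible sketch, assuming the keys only need trimming and snake_case normalisation (the real helper may apply project-specific renames):

def format_header(meta_line):
    # Hypothetical sketch of the missing helper: strips the leading '#',
    # splits on tabs and normalises each column name to snake_case.
    keys = meta_line.lstrip('#').rstrip('\r\n').split('\t')
    return [k.strip().lower().replace(' ', '_').replace('/', '_') for k in keys]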
Example #3
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the pindel vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile, which doesn't enforce VCF format checks
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf +
                                                      '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
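
A hypothetical invocation of the function above; the archive layout, member names and prefix are illustrative only, not from the original source:

process_vcf(
    archive='pindel_result.tar.gz',                # upstream tarball (assumed)
    vcf='pindel/tumour.flagged.vcf.gz',            # member path in archive (assumed)
    vcf_index='pindel/tumour.flagged.vcf.gz.tbi',
    output_prefix='sample.pindel',                 # yields sample.pindel.vcf.gz and .tbi
)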
Example #4
def addCADD(in_VCF, in_CADD_files, out_VCF):
    cadds = CADD(in_CADD_files)
    with pysam.VariantFile(in_VCF, 'r') as ifile, pysam.BGZFile(out_VCF,
                                                                'w') as ofile:
        for x in ['CADD_RAW', 'CADD_PHRED']:
            if x in ifile.header.info:
                raise Exception(
                    '{} already exists in input VCF/BCF.'.format(x))
        ifile.header.add_line(
            '##INFO=<ID=CADD_RAW,Number=A,Type=Float,Description="Raw CADD scores">'
        )
        ifile.header.add_line(
            '##INFO=<ID=CADD_PHRED,Number=A,Type=Float,Description="Phred-scaled CADD scores">'
        )
        ofile.write('{}'.format(ifile.header).encode())
        for record in ifile:
            raw_scores = []
            phred_scores = []
            for alt in record.alts:
                cadd_raw, cadd_phred = cadds.get(record.chrom, record.pos,
                                                 record.ref, alt)
                raw_scores.append(cadd_raw)
                phred_scores.append(cadd_phred)
            if any(raw_scores) and any(phred_scores):
                record.info['CADD_RAW'] = raw_scores
                record.info['CADD_PHRED'] = phred_scores
            ofile.write('{}'.format(record).encode())
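
Note the pattern: the INFO lines are added to the input file's header in memory, and both header and records are rendered to text and written through BGZFile rather than through a second VariantFile opened for writing. This avoids constructing a separate output header object while keeping pysam's text rendering of each record; tabix indexing of the result is left to a later step.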
Example #5
def merge_coverage_files(coverage_files, out_coverage_file):
    with pysam.BGZFile(out_coverage_file, 'w') as oz:
        overlap_head = deque([])
        overlap_tail = deque([])
        for coverage_file in coverage_files:
            last_position = None
            with gzip.open(coverage_file['name'], 'rt') as iz:
                for line in iz:
                    fields = line.split('\t')
                    chrom = fields[0]
                    if chrom != coverage_file['chrom']:
                        raise Exception(
                            'Multiple chromosomes detected within {} coverage file!'
                            .format(coverage_file['name']))
                    position = int(fields[1])  # 'long' exists only in Python 2

                    if last_position is None or last_position < position:
                        last_position = position
                    else:
                        raise Exception(
                            'Positions within {} coverage file are not in ascending order or not unique!'
                            .format(coverage_file['name']))

                    while overlap_head:
                        (overlap_position, overlap_line) = overlap_head[0]

                        if overlap_position >= coverage_file[
                                'next_leftmost_position']:
                            raise Exception(
                                "Overlapping regions are present in more than two coverage files!"
                            )

                        if overlap_position < position:
                            oz.write(overlap_line.encode())
                            overlap_head.popleft()
                        elif overlap_position == position:
                            overlap_head.popleft()
                            overlap_data = json.loads(
                                overlap_line.split('\t')[2])
                            data = json.loads(fields[2])
                            if overlap_data['mean'] > data['mean']:
                                line = overlap_line
                            else:
                                break
                        else:
                            break

                    if position < coverage_file['next_leftmost_position']:
                        oz.write(line.encode())
                    else:
                        overlap_tail.append((position, line))

            if overlap_head:
                raise Exception(
                    "Nested regions detected in two coverage files!")
            overlap_head = overlap_tail
            overlap_tail = deque([])

        if overlap_tail:
            raise Exception("Error while merging coverage files!")
Example #6
def add_percentiles(in_VCF, in_pctl_files, out_VCF):
    percentiles = [Percentiles(x) for x in in_pctl_files]
    with pysam.VariantFile(in_VCF, 'r') as ifile, pysam.BGZFile(out_VCF, 'w') as ofile:
        for p in percentiles:
            for desc in p.descriptions():
                ifile.header.add_line(desc)
        ofile.write('{}'.format(ifile.header).encode())
        for record in ifile:
            for p in percentiles:
                for key, values in p.get(record.chrom, record.pos, record.ref, record.alts):
                    record.info[key] = values
            ofile.write('{}'.format(record).encode())
Example #7
def prune(in_coverage_file, out_coverage_file, fluctuation_limit=0.25):
    with gzip.open(in_coverage_file, 'rt') as iz, \
            pysam.BGZFile(out_coverage_file, 'w') as oz:
        fields = iz.readline().split('\t')
        bin_data = json.loads(fields[2])
        for line in iz:
            fields = line.split('\t')
            data = json.loads(fields[2])
            if ((abs(bin_data['mean'] - data['mean']) > fluctuation_limit)
                    or (abs(bin_data['median'] - data['median']) >
                        fluctuation_limit)):
                write_data(oz, bin_data)
                bin_data = data
            bin_data['end'] = data['start']
        write_data(oz, bin_data)
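
A bin is flushed only when the incoming line's mean or median drifts more than fluctuation_limit from the open bin; each iteration then sets bin_data['end'] to the start of the line just read, so consecutive similar lines collapse into a single interval.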
Example #8
def write_species_specific_vcf_gzip(self):
    vcf = VCF()
    samples = vcf.get_sample_names(self.gvcf_file)
    vcf_header = vcf.get_vcf_header(self.gvcf_file)
    total_nb = 0
    specific_snv = 0
    last_chr = None
    with gzip.open(self.gvcf_file, 'rt') as vcf_reader, \
            pysam.BGZFile(self.output_vcf, 'wb') as output_writer, \
            open(self.log_file, 'w') as log_writer:
        output_writer.write(vcf_header.encode('utf-8'))
        for line in vcf_reader:
            line = line.rstrip()
            if not line.startswith('#'):
                snp = SNP(line, samples)
                specific_snv, total_nb = self.__log(
                    snp.chrom, specific_snv, total_nb, log_writer, False)
                if last_chr != snp.chrom:
                    if last_chr is not None:
                        specific_snv, total_nb = self.__log(
                            snp.chrom, specific_snv, total_nb, log_writer, True)
                    last_chr = snp.chrom
                if self.__is_not_INDEL(snp):
                    # general variant quality filter
                    if not self.__is_variant_filtered(snp):
                        # only species-specific variants are kept
                        if self.__is_variant_specific(snp, self.samples_interest_names):
                            output_writer.write((snp.vcf_line + "\n").encode('utf-8'))
                            specific_snv += 1
                            last_chr = snp.chrom
                total_nb += 1
        log_writer.write(
            "Chromosome:\t{}\t\tSpecific/Total:\t{}/{}\n".format(
                last_chr, specific_snv, total_nb))
Example #9
    dest='out_coverage_file',
    required=True,
    help='Output JSON coverage (compressed with bgzip) file')


def write(output_file, data):
    output_file.write('{}\t{:d}\t{:d}\t'.format(data['chrom'], data['start'],
                                                data['end']).encode())
    output_file.write(rapidjson.dumps(data).encode())
    output_file.write('\n'.encode())


if __name__ == '__main__':
    args = argparser.parse_args()
    with gzip.open(args.in_coverage_file,
                   'rt') as ifile, pysam.BGZFile(args.out_coverage_file,
                                                 'w') as ofile:
        line = ifile.readline()
        if not line:
            sys.exit(0)
        chrom, start, stop, data = line.rstrip().split('\t')
        bin_data = rapidjson.loads(data)
        for line in ifile:
            chrom, start, stop, data = line.rstrip().split('\t')
            data = rapidjson.loads(data)
            if (abs(bin_data['mean'] - data['mean']) > args.fluctuation_limit
                ) or (abs(bin_data['median'] - data['median']) >
                      args.fluctuation_limit):
                write(ofile, bin_data)
                bin_data = data
            else:
                bin_data['end'] = data['end']
        write(ofile, bin_data)
Example #10
    'Output file of depth information compressed with bgzip. In addition to this file, the tabix index will be produced.'
)

if __name__ == '__main__':
    args = argparser.parse_args()
    file_names = []
    with open(args.in_files_list, 'r') as ifile:
        for line in ifile:
            line = line.strip()
            if line:
                file_names.append(line)
    chromosomes = set()
    positions = dict()
    n_indv = len(file_names)
    breaks = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    with ExitStack() as stack, pysam.BGZFile(args.out_file_name, 'w') as ofile:
        ifiles = [
            stack.enter_context(gzip.open(file_name, 'rt'))
            for file_name in file_names
        ]
        while True:
            for i, ifile in enumerate(ifiles):
                line = ifile.readline()
                if line:
                    chromosome, position, dp = line.rstrip().split()
                    chromosomes.add(chromosome)
                    if len(chromosomes) > 1:
                        raise Exception(
                            'Multiple chromosomes detected in input files, but only one is allowed.'
                        )
                    positions.setdefault(int(position), []).append(int(dp))