def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.

    The raw VCF and its index are extracted from ``archive`` to temporary
    files named ``<output_prefix>.tmp.vcf.gz`` and
    ``<output_prefix>.tmp.vcf.gz.tbi``.  The raw VCF is then rewritten to
    ``<output_prefix>.vcf.gz`` with these changes:

    * ``##SAMPLE=<ID=TUMOUR`` meta lines get ``ID=TUMOUR`` -> ``ID=TUMOR``;
    * the ``#CHROM`` column header line gets ``TUMOUR`` -> ``TUMOR``;
    * records whose REF and ALT columns are identical are dropped
      (BINF-306) and a warning is logged for each.

    Finally the output is tabix-indexed and the temporary files are removed.
    The temporary files are only removed when processing succeeds.

    :param archive: archive handle/path passed through to ``extract_file``
    :param vcf: name of the VCF member inside the archive
    :param vcf_index: name of the VCF index member inside the archive
    :param output_prefix: prefix used for every file this function writes
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))

    # Context managers guarantee both handles are closed, including when
    # opening the reader fails after the writer has already been created.
    with pysam.BGZFile(out_formatted_vcf, mode='wb') as writer, \
            pysam.BGZFile(out_raw_vcf, mode='rb') as reader:
        for raw_line in reader:
            line = raw_line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    line = line.replace('ID=TUMOUR', 'ID=TUMOR')
            elif line.startswith('#CHROM'):
                line = line.replace('TUMOUR', 'TUMOR')
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warning(
                        "Removing loci {0}:{1} where ref and alt alleles are same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
            # Every emitted line is written with an explicit terminator.
            writer.write((line + '\n').encode('utf-8'))

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf + '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
def process_bedpe(archive, bedpe, bedpe_index, output_prefix):
    """
    Extracts and processes the brass bedpe file.

    The raw bedpe and its index are extracted from ``archive`` to temporary
    files named ``<output_prefix>.tmp.bedpe.gz`` and
    ``<output_prefix>.tmp.bedpe.gz.tbi``.  The raw bedpe is rewritten to
    ``<output_prefix>.bedpe.gz``:

    * the last ``#`` line seen before the first data row is passed to
      ``format_header`` and the result is written as the single header line;
    * in every data row the ``brass_notation`` column has ``Chr.chr``
      replaced by ``chr``.

    The output is then tabix-indexed and the temporary files are removed.
    The temporary files are only removed when processing succeeds.

    :param archive: archive handle/path passed through to ``extract_file``
    :param bedpe: name of the bedpe member inside the archive
    :param bedpe_index: name of the bedpe index member inside the archive
    :param output_prefix: prefix used for every file this function writes
    :raises AssertionError: if the formatted header contains duplicate keys
    """
    out_raw_bedpe = '{0}.tmp.bedpe.gz'.format(output_prefix)
    logger.info("Extracting raw bedpe to tmp file {0}".format(out_raw_bedpe))
    extract_file(archive, bedpe, out_raw_bedpe)

    out_raw_bedpe_index = '{0}.tmp.bedpe.gz.tbi'.format(output_prefix)
    logger.info("Extracting raw bedpe index to tmp file {0}".format(out_raw_bedpe_index))
    extract_file(archive, bedpe_index, out_raw_bedpe_index)

    out_formatted_bedpe = '{0}.bedpe.gz'.format(output_prefix)
    logger.info("Creating final bedpe {0}".format(out_formatted_bedpe))

    # Context managers guarantee both handles are closed, including when
    # opening the reader fails after the writer has already been created.
    with pysam.BGZFile(out_formatted_bedpe, mode='wb') as writer, \
            pysam.BGZFile(out_raw_bedpe, mode='rb') as reader:
        meta_line = None
        header_written = False
        hdr = []
        for raw_line in reader:
            line = raw_line.decode('utf-8')
            if line.startswith('#'):
                # Remember only the most recent comment line; it is used as
                # the column header once the first data row is reached.
                meta_line = line
                continue

            if not header_written:
                hdr = format_header(meta_line)
                # Explicit raise (same exception type as the former assert)
                # so the check is not stripped when running with -O.
                if len(hdr) != len(set(hdr)):
                    raise AssertionError(
                        "Duplicate header keys {0}".format(','.join(hdr)))
                writer.write(('#' + '\t'.join(hdr) + '\n').encode('utf-8'))
                header_written = True

            dat = dict(zip(hdr, line.rstrip('\r\n').split('\t')))
            dat['brass_notation'] = dat['brass_notation'].replace('Chr.chr', 'chr')
            new_line = "\t".join([dat[i] for i in hdr]) + '\n'
            writer.write(new_line.encode('utf-8'))

    # tabix index
    logger.info("Creating final bedpe index {0}".format(out_formatted_bedpe + '.tbi'))
    pysam.tabix_index(
        out_formatted_bedpe,
        preset='bed',
        force=True
    )

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_bedpe)
    os.remove(out_raw_bedpe_index)
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the pindel vcf file.

    The raw VCF and its index are extracted from ``archive`` to temporary
    files named ``<output_prefix>.tmp.vcf.gz`` and
    ``<output_prefix>.tmp.vcf.gz.tbi``.  The raw VCF is then rewritten to
    ``<output_prefix>.vcf.gz`` with these changes:

    * ``##SAMPLE=<ID=TUMOUR`` meta lines get ``ID=TUMOUR`` -> ``ID=TUMOR``;
    * the ``#CHROM`` column header line gets ``TUMOUR`` -> ``TUMOR``.

    All other lines are copied unchanged.  Finally the output is
    tabix-indexed and the temporary files are removed.  The temporary files
    are only removed when processing succeeds.

    :param archive: archive handle/path passed through to ``extract_file``
    :param vcf: name of the VCF member inside the archive
    :param vcf_index: name of the VCF index member inside the archive
    :param output_prefix: prefix used for every file this function writes
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))

    # Context managers guarantee both handles are closed, including when
    # opening the reader fails after the writer has already been created.
    with pysam.BGZFile(out_formatted_vcf, mode='wb') as writer, \
            pysam.BGZFile(out_raw_vcf, mode='rb') as reader:
        for raw_line in reader:
            line = raw_line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    line = line.replace('ID=TUMOUR', 'ID=TUMOR')
            elif line.startswith('#CHROM'):
                line = line.replace('TUMOUR', 'TUMOR')
            # Every line is written with an explicit terminator.
            writer.write((line + '\n').encode('utf-8'))

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf + '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
def addCADD(in_VCF, in_CADD_files, out_VCF):
    """
    Annotate every record of a VCF/BCF with CADD scores.

    Adds two per-ALT-allele INFO fields, ``CADD_RAW`` and ``CADD_PHRED``, to
    the header and writes the header plus all records as text to a
    bgzip-compressed ``out_VCF``.  For each ALT allele the scores come from
    ``CADD(in_CADD_files).get(chrom, pos, ref, alt)``.

    :param in_VCF: path of the input VCF/BCF, opened with pysam.VariantFile
    :param in_CADD_files: passed unchanged to the ``CADD`` constructor
    :param out_VCF: path of the bgzip-compressed output VCF
    :raises Exception: if the input header already declares ``CADD_RAW`` or
        ``CADD_PHRED``
    """
    cadds = CADD(in_CADD_files)
    with pysam.VariantFile(in_VCF, 'r') as ifile, pysam.BGZFile(out_VCF, 'w') as ofile:
        for x in ['CADD_RAW', 'CADD_PHRED']:
            if x in ifile.header.info:
                raise Exception(
                    '{} already exists in input VCF/BCF.'.format(x))
        ifile.header.add_line(
            '##INFO=<ID=CADD_RAW,Number=A,Type=Float,Description="Raw CADD scores">'
        )
        # The attribute key was previously misspelled "Desctiption", leaving
        # the INFO declaration without a Description.
        ifile.header.add_line(
            '##INFO=<ID=CADD_PHRED,Number=A,Type=Float,Description="Phred-scaled CADD scores">'
        )
        ofile.write('{}'.format(ifile.header).encode())
        for record in ifile:
            raw_scores = []
            phred_scores = []
            # record.alts is None when the record has no ALT allele; treat
            # that as "nothing to annotate" instead of failing to iterate.
            for alt in record.alts or ():
                cadd_raw, cadd_phred = cadds.get(record.chrom, record.pos,
                                                 record.ref, alt)
                raw_scores.append(cadd_raw)
                phred_scores.append(cadd_phred)
            # NOTE(review): this is a truthiness test, so a record whose
            # scores are all 0.0 is left unannotated just like one with no
            # scores at all — confirm what cadds.get returns for a miss.
            if any(x for x in raw_scores) and any(x for x in phred_scores):
                record.info['CADD_RAW'] = raw_scores
                record.info['CADD_PHRED'] = phred_scores
            ofile.write('{}'.format(record).encode())
def merge_coverage_files(coverage_files, out_coverage_file):
    """
    Merge several gzip-compressed coverage files into one bgzip-compressed file.

    Each element of ``coverage_files`` is a mapping from which this function
    reads three keys: ``'name'`` (path of the gzip file), ``'chrom'`` (the only
    chromosome allowed in that file) and ``'next_leftmost_position'`` (lines at
    or beyond this position are held back and reconciled with the next file).

    Every input line is tab-separated with the chromosome in column 0, the
    position in column 1 and a JSON document containing a ``'mean'`` key in
    column 2.  When two consecutive files contain the same position, the line
    whose JSON ``'mean'`` is larger is kept (the current file wins ties).

    NOTE(review): the use of ``long`` and of ``str`` separators on lines read
    from ``gzip.GzipFile`` indicates this targets Python 2.

    :param coverage_files: iterable of per-file mappings as described above
    :param out_coverage_file: path of the bgzip-compressed output file
    :raises Exception: on a chromosome mismatch, on non-ascending or duplicate
        positions within one file, or when held-back lines cannot be
        reconciled with the immediately following file
    """
    with pysam.BGZFile(out_coverage_file, 'w') as oz:
        # overlap_head: (position, line) pairs held back from the previous file.
        # overlap_tail: pairs held back from the file currently being read.
        overlap_head = deque([])
        overlap_tail = deque([])
        for coverage_file in coverage_files:
            last_position = None
            with gzip.GzipFile(coverage_file['name']) as iz:
                for line in iz:
                    fields = line.split('\t')
                    chrom = fields[0]
                    if chrom != coverage_file['chrom']:
                        raise Exception(
                            'Multiple chromosomes detected within {} coverage file!'
                            .format(coverage_file['name']))
                    position = long(fields[1])
                    # Positions must be strictly increasing within one file.
                    if last_position is None or last_position < position:
                        last_position = position
                    else:
                        raise Exception(
                            'Positions within {} coverage file are not in ascending order or not unique!'
                            .format(coverage_file['name']))
                    # Drain held-back lines from the previous file that sort
                    # before (or at) the current position.
                    while overlap_head:
                        (overlap_position, overlap_line) = overlap_head[0]
                        if overlap_position >= coverage_file[
                                'next_leftmost_position']:
                            raise Exception(
                                "Overlapping regions are present in more than two coverage files!"
                            )
                        if overlap_position < position:
                            # Held-back line precedes the current one: emit it.
                            oz.write(overlap_line)
                            overlap_head.popleft()
                        elif overlap_position == position:
                            # Same position in both files: keep whichever line
                            # has the larger mean.
                            overlap_head.popleft()
                            overlap_data = json.loads(
                                overlap_line.split('\t')[2])
                            data = json.loads(fields[2])
                            if overlap_data['mean'] > data['mean']:
                                line = overlap_line
                            else:
                                break
                        else:
                            # Remaining held-back lines lie after the current
                            # position; handle them on a later iteration.
                            break
                    if position < coverage_file['next_leftmost_position']:
                        oz.write(line)
                    else:
                        # Hold the line back for reconciliation with the next file.
                        overlap_tail.append((position, line))
            # Anything still held back from the previous file was not matched
            # or passed by any position of the current file.
            if overlap_head:
                raise Exception(
                    "Nested regions detected in two coverage files!")
            overlap_head = overlap_tail
            overlap_tail = deque([])
        # NOTE(review): overlap_tail is always an empty deque here (freshly
        # created, or just reset at the end of the loop body), so this check
        # cannot fire; possibly overlap_head was intended — confirm.
        if overlap_tail:
            raise Exception("Error while merging coverage files!")
def add_percentiles(in_VCF, in_pctl_files, out_VCF):
    """
    Annotate every record of a VCF/BCF with percentile INFO fields.

    One ``Percentiles`` object is built per entry of ``in_pctl_files``.  Each
    contributes header lines via ``descriptions()`` and, per record,
    ``(key, values)`` pairs via ``get(chrom, pos, ref, alts)`` that are stored
    in ``record.info``.  The header and all records are written as text to a
    bgzip-compressed ``out_VCF``.

    :param in_VCF: path of the input VCF/BCF, opened with pysam.VariantFile
    :param in_pctl_files: iterable of arguments, one per ``Percentiles`` object
    :param out_VCF: path of the bgzip-compressed output VCF
    """

    def as_bytes(text):
        # BGZFile.write needs a bytes-like object; under Python 3 the
        # formatted header/record is str and must be encoded first.  A value
        # that is already bytes (Python 2 str) is passed through untouched.
        return text if isinstance(text, bytes) else text.encode('utf-8')

    percentiles = [Percentiles(x) for x in in_pctl_files]
    with pysam.VariantFile(in_VCF, 'r') as ifile, pysam.BGZFile(out_VCF, 'w') as ofile:
        for p in percentiles:
            for desc in p.descriptions():
                ifile.header.add_line(desc)
        ofile.write(as_bytes('{}'.format(ifile.header)))
        for record in ifile:
            for p in percentiles:
                for key, values in p.get(record.chrom, record.pos,
                                         record.ref, record.alts):
                    record.info[key] = values
            ofile.write(as_bytes('{}'.format(record)))
def prune(in_coverage_file, out_coverage_file, fluctuation_limit=0.25):
    """
    Collapse runs of similar coverage lines into single bins.

    Input lines are tab-separated with a JSON document in column 2 that has
    at least ``'mean'``, ``'median'`` and ``'start'`` keys.  Starting from the
    first line, following lines are folded into the current bin until either
    the mean or the median differs from the bin's by more than
    ``fluctuation_limit``; the bin is then handed to ``write_data`` and a new
    bin starts at the differing line.  The current bin's ``'end'`` is set to
    the ``'start'`` of every line read.

    An empty input file produces an empty output file.

    :param in_coverage_file: path of the gzip-compressed input file
    :param out_coverage_file: path of the bgzip-compressed output file
    :param fluctuation_limit: maximum absolute mean/median difference for a
        line to stay in the current bin
    """
    with gzip.GzipFile(in_coverage_file, 'r') as iz, pysam.BGZFile(out_coverage_file, 'w') as oz:
        first_line = iz.readline()
        if not first_line:
            # Nothing to prune; previously this raised IndexError.
            return
        # The separator is a bytes literal so the split works on the bytes
        # lines GzipFile yields (and on Python 2 str, where b'\t' == '\t').
        bin_data = json.loads(first_line.split(b'\t')[2])
        for line in iz:
            data = json.loads(line.split(b'\t')[2])
            mean_jump = abs(bin_data['mean'] - data['mean']) > fluctuation_limit
            median_jump = abs(bin_data['median'] - data['median']) > fluctuation_limit
            if mean_jump or median_jump:
                write_data(oz, bin_data)
                bin_data = data
            # Extend the current bin up to the line just read.
            bin_data['end'] = data['start']
        # Flush the last open bin.
        write_data(oz, bin_data)
def write_species_specific_vcf_gzip(self):
    """
    Write the species-specific variants of ``self.gvcf_file`` to a bgzipped VCF.

    Reads the gzip-compressed VCF at ``self.gvcf_file`` line by line, writes
    its header followed by every record that passes three checks
    (``__is_not_INDEL``, not ``__is_variant_filtered`` and
    ``__is_variant_specific`` against ``self.samples_interest_names``) to
    ``self.output_vcf``, and writes progress/summary counts to
    ``self.log_file``.

    NOTE(review): the exact behaviour of ``__log`` (when it writes, and
    whether it resets the counters it returns) is not visible here — confirm
    against its definition.
    """
    vcf = VCF()
    samples = vcf.get_sample_names(self.gvcf_file)
    vcf_header = vcf.get_vcf_header(self.gvcf_file)
    total_nb = 0  # records seen (as possibly adjusted by __log)
    specific_snv = 0  # records written (as possibly adjusted by __log)
    last_chr = None  # chromosome of the previously processed record
    with gzip.open(self.gvcf_file, 'rt') as vcf_reader, pysam.BGZFile(
            self.output_vcf, 'wb') as output_writer, open(self.log_file, 'w') as log_writer:
        output_writer.write(vcf_header.encode('utf-8'))
        for line in vcf_reader:
            line = line.rstrip()
            # Header lines were already emitted via vcf_header above.
            if not line.startswith('#'):
                snp = SNP(line, samples)
                specific_snv, total_nb = self.__log(
                    snp.chrom, specific_snv, total_nb, log_writer, False)
                if last_chr != snp.chrom:
                    # Chromosome changed: call __log again with the last
                    # argument set to True (skipped for the very first record).
                    if last_chr is not None:
                        specific_snv, total_nb = self.__log(
                            snp.chrom, specific_snv, total_nb, log_writer, True)
                    last_chr = snp.chrom
                if self.__is_not_INDEL(snp):
                    if not self.__is_variant_filtered(
                            snp):  # general variant quality filter
                        if self.__is_variant_specific(
                                snp, self.samples_interest_names
                        ):  # only species specific ones are kept
                            output_writer.write(
                                (snp.vcf_line + "\n").encode('utf-8'))
                            specific_snv += 1
                last_chr = snp.chrom
                total_nb += 1
        # Final summary line for the last chromosome seen.
        log_writer.write(
            "Chromosome:\t{}\t\tSpecific/Total:\t{}/{}\n".format(
                last_chr, specific_snv, total_nb))
        # NOTE(review): these explicit close() calls duplicate what the
        # enclosing with statement already does on exit.
        log_writer.close()
        output_writer.close()
        vcf_reader.close()
dest='out_coverage_file', required=True, help='Output JSON coverage (compressed with bgzip) file') def write(output_file, data): output_file.write('{}\t{:d}\t{:d}\t'.format(data['chrom'], data['start'], data['end']).encode()) rapidjson.dump(data, output_file) output_file.write('\n'.encode()) if __name__ == '__main__': args = argparser.parse_args() with gzip.open(args.in_coverage_file, 'rt') as ifile, pysam.BGZFile(args.out_coverage_file, 'w') as ofile: line = ifile.readline() if not line: sys.exit(0) chrom, start, stop, data = line.rstrip().split('\t') bin_data = rapidjson.loads(data) for line in ifile: chrom, start, stop, data = line.rstrip().split('\t') data = rapidjson.loads(data) if (abs(bin_data['mean'] - data['mean']) > args.fluctuation_limit ) or (abs(bin_data['median'] - data['median']) > args.fluctuation_limit): write(ofile, bin_data) bin_data = data else: bin_data['end'] = data['end']
'Output file of depth information compressed with bgzip. In addition to this file, the tabix index will be produced.' ) if __name__ == '__main__': args = argparser.parse_args() file_names = [] with open(args.in_files_list, 'r') as ifile: for line in ifile: line = line.strip() if line: file_names.append(line) chromosomes = set() positions = dict() n_indv = len(file_names) breaks = [1, 5, 10, 15, 20, 25, 30, 50, 100] with ExitStack() as stack, pysam.BGZFile(args.out_file_name, 'w') as ofile: ifiles = [ stack.enter_context(gzip.open(file_name, 'rt')) for file_name in file_names ] while True: for i, ifile in enumerate(ifiles): line = ifile.readline() if line: chromosome, position, dp = line.rstrip().split() chromosomes.add(chromosome) if len(chromosomes) > 1: raise Exception( f'Multiple chromosomes detected in input files, but only one is allowed.' ) positions.setdefault(int(position), []).append(int(dp))