def merge(in_files, out_file, attach_read_group_from_file_name=False, header_file=None, compression_level=9, num_compression_threads=0):
    """Merge coordinate-sorted BAM files with `samtools merge` and index the result.

    :param in_files: dict or list of input BAM paths; ordering follows flatten_input.
    :param out_file: path of the merged BAM to write (overwritten if present, via `-f`).
    :param attach_read_group_from_file_name: pass `-r` so read groups are derived from file names.
    :param header_file: optional SAM header file passed via `-h`.
    :param compression_level: compression level passed to `-l`.
    :param num_compression_threads: additional threads passed to `-@`.
    """
    cmd = [
        'samtools', 'merge',
        '-c', '-p', '-f',
        '-l', compression_level,
        '-@', num_compression_threads,
    ]
    if attach_read_group_from_file_name:
        cmd.append('-r')
    if header_file is not None:
        cmd += ['-h', header_file]
    cmd.append(out_file)
    cmd.extend(flatten_input(in_files))
    pypeliner.commandline.execute(*cmd)
    # Index the merged BAM so downstream tools can random-access it.
    pypeliner.commandline.execute('samtools', 'index', out_file, _get_bam_index_filename(out_file))
def concatenate_tables(in_files, out_file, ignore_empty_files=False, use_gzip=True):
    """Concatenate tab-delimited tables into a single file, writing the header once.

    :param in_files: dict or list of input table paths; ordering follows flatten_input.
    :param out_file: path of the concatenated output table.
    :param ignore_empty_files: skip inputs with no parseable content instead of raising.
    :param use_gzip: gzip-compress the output when True.
    """
    if use_gzip:
        # gzip.open (unlike gzip.GzipFile) supports text mode, which pandas
        # requires when to_csv writes str data under Python 3.
        out_fh = gzip.open(out_file, 'wt')
    else:
        out_fh = open(out_file, 'w')
    write_header = True
    with out_fh:
        for file_name in flatten_input(in_files):
            try:
                df = pd.read_csv(file_name, sep='\t')
            except EmptyDataError:
                if ignore_empty_files:
                    continue
                raise  # bare raise preserves the original traceback
            if df.empty:
                # Header-only inputs contribute no rows; skip them so the
                # output header comes from a file that actually has data.
                continue
            df.to_csv(out_fh, header=write_header, index=False, sep='\t')
            write_header = False
def write_header_file(in_files, out_file, seq_info):
    """Write a header-only SAM file derived from the first input's header.

    Parses ``key:value`` tokens out of the first PG record's CL field,
    collapses PG down to a single 'bwa' entry, and overlays ``seq_info``
    fields onto every SQ record.

    :param in_files: dict or list of SAM/BAM paths; only the first file's header is used.
    :param out_file: path of the header-only SAM file to write.
    :param seq_info: dict of fields copied onto each SQ entry.
    """
    bam = pysam.AlignmentFile(flatten_input(in_files)[0], mode='r', check_sq=False)
    header = bam.header.copy()
    bam.close()
    for x in header['PG'][0]['CL'].split('\t'):
        if ':' in x:
            # Split on the first ':' only, so values that themselves contain
            # ':' (e.g. file paths) do not raise a ValueError on unpacking.
            key, value = x.split(':', 1)
            header['PG'][0][key] = value
    header['PG'] = [{
        'ID': 'bwa',
        'VN': header['PG'][0]['VN'],
        'CL': 'bwa aln; bwa sampe'
    }]
    for entry in header['SQ']:
        for key in seq_info:
            entry[key] = seq_info[key]
    # mode='wh' writes the header only; close immediately to flush it.
    pysam.AlignmentFile(out_file, header=header, mode='wh').close()
def merge_vcfs(in_files, out_file):
    """Merge VCF files into a single minimal VCF (site columns only).

    ID, QUAL, FILTER and INFO are written as '.' for every record.

    :param in_files: dict or list of input VCF paths.
    :param out_file: path of the merged VCF to write.
    """
    in_files = flatten_input(in_files)
    with open(out_file, 'w') as out_fh:
        write_header(out_fh)
        writer = csv.DictWriter(
            out_fh, ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'], delimiter='\t')
        reader = MultiVcfReader(in_files)
        try:
            for row in reader:
                writer.writerow({
                    'CHROM': row.chrom,
                    'POS': row.coord,
                    'ID': '.',
                    'REF': row.ref,
                    'ALT': row.alt,
                    'QUAL': '.',
                    'FILTER': '.',
                    'INFO': '.'
                })
        finally:
            # Release the reader's underlying file handles even if writing fails.
            reader.close()
def concatenate(in_files, out_file):
    """ Concatenate FASTQ files.

    Inputs are assumed gzip-compressed; they are decompressed and re-compressed
    into a single gzipped output.

    :param in_files: dict or list of gzipped FASTQ paths; ordering follows flatten_input.
    :param out_file: path of the gzipped output file.
    """
    with gzip.GzipFile(out_file, 'w') as out_fh:
        for in_file in flatten_input(in_files):
            with gzip.GzipFile(in_file, 'r') as in_fh:
                # Stream in chunks rather than in_fh.read(), which would load
                # each whole (decompressed) file into memory.
                shutil.copyfileobj(in_fh, out_fh)
def concatenate_bcf(in_files, out_file):
    """ Fast concatenation of BCF file using `bcftools`.

    :param in_files: dict with values being files to be concatenated. Files
        will be concatenated based on sorted order of keys.
    :param out_file: path where output file will be written in VCF format.
    """
    cmd = [
        'bcftools', 'concat',
        '-a',
        '-O', 'b',
        '-o', out_file,
    ]
    cmd.extend(flatten_input(in_files))
    pypeliner.commandline.execute(*cmd)
    index_vcf(out_file)
    index_bcf(out_file)
def mark_duplicates(in_files, out_file, compression_level=9, hash_table_size=262144, io_buffer_size=128, num_threads=1, overflow_list_size=200000, tmp_dir=None):
    """Mark PCR duplicates with `sambamba markdup` and index the output BAM.

    :param in_files: dict or list of input BAM paths.
    :param out_file: path of the duplicate-marked BAM to write.
    :param compression_level: compression level passed to `-l`.
    :param hash_table_size: value for sambamba --hash-table-size.
    :param io_buffer_size: value for sambamba --io-buffer-size.
    :param num_threads: worker threads passed to `-t`.
    :param overflow_list_size: value for sambamba --overflow-list-size.
    :param tmp_dir: scratch directory; when None a temporary directory is
        created and removed afterwards.
    """
    # Resolve the scratch directory BEFORE entering the try block: in the
    # original layout, a failure in mkdtemp() left `clean_up` unbound and the
    # finally clause raised NameError, masking the real error.
    if tmp_dir is None:
        tmp_dir = tempfile.mkdtemp()
        clean_up = True
    else:
        clean_up = False
    try:
        cmd = [
            'sambamba', 'markdup',
            '-p',
            '-l', compression_level,
            '-t', num_threads,
            '--io-buffer-size={0}'.format(io_buffer_size),
            '--hash-table-size={0}'.format(hash_table_size),
            '--overflow-list-size={0}'.format(overflow_list_size),
            '--tmpdir', tmp_dir,
        ]
        cmd.extend(flatten_input(in_files))
        cmd.append(out_file)
        pypeliner.commandline.execute(*cmd)
    finally:
        if clean_up:
            shutil.rmtree(tmp_dir)
    pypeliner.commandline.execute('samtools', 'index', out_file, _get_bam_index_filename(out_file))
def concatenate_vcf(in_files, out_file, allow_overlap=False, docker_config=None):
    """ Fast concatenation of VCF file using `bcftools`.

    :param in_files: dict with values being files to be concatenated. Files
        will be concatenated based on sorted order of keys.
    :param out_file: path where output file will be written in VCF format.
    :param allow_overlap: pass `-a` so inputs with overlapping records can be combined.
    :param docker_config: optional dict of keyword arguments forwarded to
        pypeliner's execute and the index helpers.
    """
    # Avoid the mutable-default-argument pitfall; None means "no extra config".
    if docker_config is None:
        docker_config = {}
    if allow_overlap:
        cmd = ['bcftools', 'concat', '-a', '-O', 'z', '-o', out_file]
    else:
        cmd = ['bcftools', 'concat', '-O', 'z', '-o', out_file]
    cmd += flatten_input(in_files)
    pypeliner.commandline.execute(*cmd, **docker_config)
    index_vcf(out_file, docker_config=docker_config)
    index_bcf(out_file, docker_config=docker_config)
def concatenate_tables(in_files, out_file, drop_duplicates=False, in_memory=True, non_numeric_as_category=True):
    """Concatenate tables either fully in memory or streaming from disk.

    Dropping duplicates is only supported by the in-memory path, so requesting
    it forces that path regardless of ``in_memory``.

    :param in_files: dict or list of input table paths.
    :param out_file: path of the concatenated output.
    :param drop_duplicates: remove duplicate rows (implies in-memory processing).
    :param in_memory: load all tables into memory before writing.
    :param non_numeric_as_category: treat non-numeric columns as categoricals.
    """
    file_list = flatten_input(in_files)
    if drop_duplicates or in_memory:
        _concatenate_tables_in_memory(
            file_list,
            out_file,
            drop_duplicates=drop_duplicates,
            non_numeric_as_category=non_numeric_as_category)
    else:
        _concatenate_tables_on_disk(
            file_list,
            out_file,
            non_numeric_as_category=non_numeric_as_category)
def merge(in_files, out_file, index_file=None):
    """Merge BAM files with Picard MergeSamFiles, optionally indexing the result.

    :param in_files: dict or list of input BAM paths.
    :param out_file: path of the merged BAM to write.
    :param index_file: if given, also write a samtools index to this path.
    """
    # NOTE(review): presumably caps glibc malloc arenas to limit the JVM's
    # native memory footprint — confirm against deployment environment.
    os.environ['MALLOC_ARENA_MAX'] = '4'
    cmd = [
        'picard',
        '-XX:ParallelGCThreads=1',
        '-Xmx8g',
        'MergeSamFiles',
        'OUTPUT={0}'.format(out_file),
        'VALIDATION_STRINGENCY=LENIENT',
    ]
    cmd.extend('INPUT={0}'.format(file_name) for file_name in flatten_input(in_files))
    pypeliner.commandline.execute(*cmd)
    if index_file is not None:
        pypeliner.commandline.execute('samtools', 'index', out_file, index_file)
def merge(in_files, reference_genome_fasta_file, out_file, attach_read_group_from_file_name=False, header_file=None, compression_level=9, num_compression_threads=0):
    """Merge alignment files into a single CRAM with `samtools merge`.

    :param in_files: dict or list of input alignment file paths.
    :param reference_genome_fasta_file: reference FASTA required for CRAM output.
    :param out_file: path of the merged CRAM to write (overwritten if present, via `-f`).
    :param attach_read_group_from_file_name: pass `-r` so read groups are derived from file names.
    :param header_file: optional SAM header file passed via `-h`.
    :param compression_level: compression level passed to `-l`.
    :param num_compression_threads: additional threads passed to `-@`.
    """
    cmd = [
        'samtools', 'merge',
        '-c', '-p', '-f',
        '-l', compression_level,
        '-@', num_compression_threads,
        '--output-fmt', 'cram',
        '--reference', reference_genome_fasta_file,
    ]
    if attach_read_group_from_file_name:
        cmd.append('-r')
    if header_file is not None:
        cmd += ['-h', header_file]
    cmd.append(out_file)
    cmd.extend(flatten_input(in_files))
    pypeliner.commandline.execute(*cmd)