def partition(samples_path, num_partitions):
    """Split a sample list into batches without splitting a patient.

    Reads one sample name per line from ``samples_path``, orders the samples
    by TCGA patient ID (the text matching ``TCGA-..-....``) and writes
    ``num_partitions`` files ``batch_1.txt`` .. ``batch_N.txt`` into the
    current working directory. Batches have roughly equal size, but a batch
    is extended past its nominal end so that all samples sharing a patient ID
    land in the same file.

    Args:
        samples_path: Path of the sample list (opened with ``zopen``).
        num_partitions: Number of batch files to write.
    """
    samples = [line.strip() for line in zopen(samples_path)]
    part_size = float(len(samples)) / num_partitions
    partition_ends = [int((p + 1) * part_size) for p in range(num_partitions)]
    print(partition_ends)

    patient_ids = []
    num_without_pid = 0
    for s in samples:
        m = re.search('TCGA-..-....', s)
        if not m:
            num_without_pid += 1
        # Samples without a patient ID are keyed by their own name; the
        # lowercase 'zzz' prefix sorts them after the 'TCGA-' identifiers.
        patient_ids.append(m.group(0) if m else 'zzz' + s)
    if num_without_pid:
        info('WARNING: %d sample names did not contain a TCGA patient ID.' %
             num_without_pid)

    # Sort by patient ID. Building the two lists explicitly (instead of
    # zip(*...)) also works when the sample list is empty.
    pairs = sorted(zip(samples, patient_ids), key=lambda pair: pair[1])
    samples = [pair[0] for pair in pairs]
    patient_ids = [pair[1] for pair in pairs]

    partitions = []
    first = 0
    for end in partition_ends:
        # "stop" is exclusive. A previous batch may already have been extended
        # past this batch's nominal end; never step backwards, otherwise
        # samples would be written twice and later ones dropped.
        stop = max(end, first)
        # Pull in the remaining samples of the patient at the batch boundary.
        while first < stop < len(samples) and \
                patient_ids[stop] == patient_ids[stop - 1]:
            stop += 1
        partitions.append(samples[first:stop])
        first = stop

    for idx, part in enumerate(partitions):
        with open('batch_%d.txt' % (idx + 1), 'w') as out:
            for s in part:
                out.write('%s\n' % s)
def variant_discard_by_position(vcf_path, pos_path):
    """Copy a VCF to stdout, omitting variants at blacklisted positions.

    The blacklist file is tab-delimited with the chromosome in column 1 and
    the position in column 2; lines with fewer than two columns are ignored.
    A leading ``chr`` is stripped from chromosome names on both sides before
    comparing. Leading ``##`` lines of the VCF are not echoed; the first
    other line is treated as the header and written out unchanged.

    Args:
        vcf_path: Path of the VCF file (opened with ``zopen``).
        pos_path: Path of the blacklist file (opened with ``zopen``).
    """

    def locus_key(fields):
        # 'chr1', '12345' -> '1:12345'
        chrom = fields[0]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        return chrom + ':' + fields[1]

    info('Reading list of blacklisted positions...')
    blacklist = set()
    for row in zopen(pos_path):
        fields = row.rstrip().split('\t')
        if len(fields) >= 2:
            blacklist.add(locus_key(fields))

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')
    # The resulting column index is not used below; the lookup is kept
    # because it raises ValueError when the header has neither column.
    headers.index('ESP6500' if 'ESP6500' in headers else 'ALT')
    sys.stdout.write(line)

    for line in vcf_file:
        if locus_key(line.rstrip().split('\t')) not in blacklist:
            sys.stdout.write(line)
def discard_if_in_controls(vcf_path, control_samples, threshold):
    """Copy a VCF to stdout, dropping variants seen in control samples.

    A sample column counts as a control when its header matches any of the
    regular expressions in ``control_samples``. A variant line is dropped
    when at least ``threshold`` control samples have a genotype whose index
    in the module-level ``gt_symbols`` list is greater than 1.

    Args:
        vcf_path: Path of the VCF file (opened with ``zopen``).
        control_samples: Iterable of regular expressions naming controls.
        threshold: Minimum number of mutated controls that discards a variant.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')

    # Genotype columns start right after ESP6500 if present, else after ALT.
    anchor = 'ESP6500' if 'ESP6500' in headers else 'ALT'
    sample_col = headers.index(anchor) + 1
    sample_names = headers[sample_col:]

    is_control = [
        any(re.search(rx, name) for rx in control_samples)
        for name in sample_names
    ]
    if not any(is_control):
        error('No control samples found.')
    info('Using these %d control samples:' % sum(is_control))
    for name, flag in zip(sample_names, is_control):
        if flag:
            info('- %s' % name)

    sys.stdout.write(line)
    for line in vcf_file:
        calls = line.rstrip('\n').split('\t')[sample_col:]
        # Each call looks like "<genotype>:<...>"; keep the text before ':'.
        genotypes = [gt_symbols.index(call[:call.find(':')])
                     for call in calls]
        mutated_controls = 0
        for flag, gt in zip(is_control, genotypes):
            if flag and gt > 1:
                mutated_controls += 1
        if mutated_controls < threshold:
            sys.stdout.write(line)
def fasta_remove_adapters(fasta_path, adapter):
    """Trim adapter sequence from FASTA/FASTQ reads and print the result.

    Every read is cut at the first position where the adapter (or a prefix
    of it that is at least five bases long) starts. Lines starting with
    ``#`` are echoed unchanged. ``>`` records are assumed to span two lines
    (header, sequence) and ``@`` records four lines (header, sequence,
    separator, qualities); for the latter the quality string is cut to the
    same length as the trimmed sequence.

    Args:
        fasta_path: Path of the read file (opened with ``zopen``).
        adapter: Adapter sequence; must be at least five characters long.
    """
    if len(adapter) < 5:
        error('Adapter sequence is too short.')

    # The first five bases are mandatory, every further base is optional but
    # only in order, e.g. ACGTACG -> ACGTA(?:C(?:G)?)?
    pattern = adapter[:5]
    for base in adapter[5:]:
        pattern += '(?:' + base
    pattern += (len(adapter) - 5) * ')?'
    adapter_re = re.compile(pattern)
    # Log the pattern text; formatting the compiled object would print its
    # repr instead of the expression.
    info('Adapter regular expression: %s' % pattern)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            if m:
                seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))       # '+' separator line
            print(next(fasta)[:trim_len])       # qualities, cut like seq
def parallel(command, job_name, max_workers, cpus, memory, partition,
             time_limit):
    """Run a command once per target, locally or through SLURM.

    Targets are read as whitespace-delimited tokens from standard input. If
    standard input is a terminal, the command is run once with an empty
    target and must not reference ``$x``. The command and the targets are
    written to ``~/.jobs/<job_name>_<timestamp>/tasks`` and then
    ``max_workers`` workers (capped at the number of targets) are started:
    ``parallel worker <log_dir>`` processes when ``partition`` is ``'local'``,
    otherwise ``sbatch`` jobs fed with the module-level ``sbatch_template``.

    Args:
        command: Shell command; newlines are replaced by spaces.
        job_name: Name used for the log directory and the batch job.
        max_workers: Maximum number of concurrent workers.
        cpus: CPUs per job.
        memory: Memory per job in GB.
        partition: SLURM partition name, or ``'local'``.
        time_limit: Passed to the template multiplied by 60.
            NOTE(review): presumably hours converted to minutes - confirm
            against ``sbatch_template``.
    """
    # Allow splitting the command string onto multiple lines.
    command = command.replace('\n', ' ')

    if sys.stdin.isatty():
        # If the user did not provide any input, just run the command once.
        # The command must not contain $x.
        if '$x' in command or '${x' in command:
            error('Command contains $x but no targets provided.')
        targets = ['']
    else:
        # Parse whitespace-delimited target items from standard input.
        # str.split() without arguments handles tabs, repeated spaces and
        # blank lines, none of which may produce an (empty) target.
        targets = []
        for line in sys.stdin:
            targets += line.split()
        if not targets:
            error('Command requires targets but none provided.')

    # Detect repeated targets in a single pass.
    seen = set()
    duplicates = set()
    for target in targets:
        if target in seen:
            duplicates.add(target)
        seen.add(target)
    if duplicates:
        error('Target list contains multiple instances of the following targets:\n' +
              '\n'.join(sorted(duplicates)))

    if max_workers > len(targets):
        max_workers = len(targets)

    if partition != 'local':
        info('Distributing %d %s named "%s" on %s partition '
             '(with %d %s and %d GB of memory per job).' % (
                 len(targets), 'jobs' if len(targets) != 1 else 'job',
                 job_name, partition, cpus,
                 'CPUs' if cpus != 1 else 'CPU', memory))
    else:
        info('Starting %d %s named "%s" on local machine.' % (
            len(targets), 'jobs' if len(targets) != 1 else 'job', job_name))

    log_dir = os.path.expanduser('~/.jobs/%s_%s' % (
        job_name, datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
    os.makedirs(log_dir)
    with open('%s/tasks' % log_dir, 'w') as f:
        f.write('%s\n' % command)
        for target in targets:
            f.write('%s\n' % target)

    if partition == 'local':
        worker_cmd = ['parallel', 'worker', log_dir]
        workers = [subprocess.Popen(worker_cmd) for _ in range(max_workers)]
        for w in workers:
            w.wait()
    else:
        # Run the job steps on a SLURM cluster using sbatch.
        # Required memory is given in GB per job step. Convert to MB per CPU.
        mem_per_cpu = round(float(memory) / cpus * 1000)
        sbatch_script = sbatch_template % (
            partition, job_name, cpus, mem_per_cpu, 60 * time_limit,
            log_dir, log_dir, log_dir)
        workers = [subprocess.Popen(['sbatch', '-Q'], stdin=subprocess.PIPE)
                   for _ in range(max_workers)]
        for w in workers:
            w.stdin.write(sbatch_script.encode('utf-8'))
            w.stdin.close()
        for w in workers:
            w.wait()
def cghub_list(samples):
    """Print one tab-separated summary line per sample, then log totals.

    Each line holds the sample's first file name, legacy sample ID,
    reference genome and center. The sample count and the summed size of
    the first file of every sample (in units of 1e9 bytes) are logged
    through ``info``.

    Args:
        samples: Sized sequence of objects with ``files``, ``filesizes``,
            ``legacy_sample_id``, ``ref_genome`` and ``center`` attributes.
    """
    for sample in samples:
        fields = (sample.files[0], sample.legacy_sample_id,
                  sample.ref_genome, sample.center)
        print('%s\t%s\t%s\t%s' % fields)

    info('Found a total of %d samples.' % len(samples))
    total_bytes = sum(sample.filesizes[0] for sample in samples)
    info('Total filesize: %.1f GB.' % (total_bytes / 1e9))
def smallrna_parse_mirbase(mirbase_gff_path):
    """Print the mature microRNA loci of a miRBase GFF file in BED format.

    Only rows whose feature type (column 3) is ``miRNA`` are used. The
    1-based inclusive GFF start is converted to a 0-based BED start, and the
    BED name is taken from the ``Name=`` attribute in column 9.

    Args:
        mirbase_gff_path: Path of the (uncompressed) miRBase GFF file.
    """
    info('Printing mature microRNA loci in BED format.')
    mirna_name_re = re.compile(r';Name=([^\s;]+)')
    with open(mirbase_gff_path) as gff:
        for line in gff:
            if line.startswith('#'):
                continue
            # rstrip('\n') rather than [:-1]: the last line of the file may
            # lack a newline and must not lose its final character.
            tokens = line.rstrip('\n').split('\t')
            # Skip blank or truncated rows instead of failing on an index.
            if len(tokens) < 9 or tokens[2] != 'miRNA':
                continue
            print('%s\t%d\t%d\t%s' % (
                tokens[0], int(tokens[3]) - 1, int(tokens[4]),
                mirna_name_re.search(tokens[8]).group(1)))
def sam_pileup(region, bam_paths, min_al_quality=0):
    """Print a compact per-sample pileup for every position of a region.

    For each position and each existing BAM file, ``samtools mpileup`` is
    run and one line ``<sample>\\t<sorted bases> <indels>`` is printed.
    Missing BAM files are reported through ``info`` and skipped; if none
    exist the function returns without output.

    Args:
        region: ``chr:pos`` or ``chr:start-end`` (spaces are ignored).
        bam_paths: Paths of the BAM files to inspect.
        min_al_quality: Value passed to ``samtools mpileup -q``.
    """
    # Check the file paths here to ensure a nicer error message if files are
    # missing.
    for path in bam_paths:
        if not os.path.isfile(path):
            info('WARNING: File %s was not found.' % path)
    bam_paths = [path for path in bam_paths if os.path.isfile(path)]
    if not bam_paths:
        return

    chrom, span = region.replace(' ', '').split(':')
    bounds = [int(x) for x in span.split('-')]
    if len(bounds) == 1:
        bounds *= 2                     # single position: start == end
    start, end = bounds[0], bounds[1]

    # Optional "<base><+|-><length>" indel marker followed by a run of word
    # characters that is not itself the start of another indel marker.
    indel_rx = re.compile(r'(\w[+-]\d+)?(\w+)(?![+-])')

    with open(os.devnull, 'w') as dev_null:
        for pos in range(start, end + 1):
            if start != end:
                print('Pileup for %s:%d:' % (chrom, pos))
            for bam in bam_paths:
                # An argument list (no shell) keeps unusual characters in
                # file names from being interpreted by a shell, and
                # universal_newlines=True yields text on Python 2 and 3.
                line = subprocess.check_output(
                    ['samtools', 'mpileup', '-A', '-B',
                     '-q%d' % min_al_quality,
                     '-r', '%s:%d-%d' % (chrom, pos, pos), bam],
                    stderr=dev_null, universal_newlines=True)
                sample_name = re.sub(r'(.*/)?(.*).bam', r'\2', bam)
                if not line:
                    print('%s\t' % sample_name)
                    continue

                tokens = line[:-1].split('\t')
                # Drop read-start markers (with their quality character),
                # read-end markers and reference-skip symbols.
                bases = re.sub(r'\^.', '', tokens[4]).upper()
                bases = re.sub(r'[$<>]', '', bases)

                # Parse the pileup string for indels
                indel_tokens = indel_rx.findall(bases)
                bases = ''.join(
                    m[1][int(m[0][2:]):] if m[0] else m[1]
                    for m in indel_tokens)
                indels = [m[0][:2] + m[1][:int(m[0][2:])]
                          for m in indel_tokens if m[0]]

                bases = ''.join(sorted(bases))
                if bases:
                    bases += ' '
                print('%s\t%s%s' % (sample_name, bases, ' '.join(indels)))
def swiss_link(tsv_path):
    """Create symbolic links listed in a two-column tab-separated file.

    Every line ``<source>\\t<destination>`` creates a symlink at
    ``destination`` pointing to ``source``. Lines without exactly two
    columns are ignored. A link is skipped (with a message through ``info``)
    when the source does not exist or the destination already exists.

    Args:
        tsv_path: Path of the tab-separated link list.
    """
    # Mode 'U' is no longer accepted by Python 3.11+; plain text mode plus
    # stripping '\r\n' handles both Unix and Windows line endings.
    with open(tsv_path) as tsv:
        for line in tsv:
            tokens = line.rstrip('\r\n').split('\t')
            if len(tokens) != 2:
                continue
            source, dest = tokens
            if not os.path.exists(source):
                info('Source file %s does not exist.' % source)
                continue
            # lexists() is also true for a dangling symlink at dest.
            if os.path.lexists(dest):
                info('Destination file %s exists. Will not overwrite.' % dest)
                continue
            os.symlink(source, dest)
def sam_reads_raw(bam_path, out_prefix):
    """Extract raw read sequences from a BAM file into three files.

    Runs ``samtools bam2fq`` and writes bare sequences (one per line):
    paired mates go to ``<out_prefix>_1.reads.gz`` and
    ``<out_prefix>_2.reads.gz`` in matching order, while reads without a
    ``/1`` or ``/2`` suffix and mates whose partner never showed up go to
    ``<out_prefix>.reads.gz``.

    Args:
        bam_path: Path of the BAM file.
        out_prefix: Prefix of the three output files.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    # Mates still waiting for their partner, keyed by read name without the
    # /1 or /2 suffix; the value is the bare sequence.
    reads_1 = {}
    reads_2 = {}

    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read only has one primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
    for line in bam2fq:
        if line[0] != '@':
            error('Invalid bam2fq output.')
        line = line[:-1]
        if line.endswith('/1'):
            segname = line[1:-2]
            mate = reads_2.pop(segname, None)
            # Compare against None: an empty stored sequence is still a mate.
            if mate is not None:
                out_1.write(next(bam2fq))
                out_2.write('%s\n' % mate)
            else:
                reads_1[segname] = next(bam2fq)[:-1]
        elif line.endswith('/2'):
            segname = line[1:-2]
            mate = reads_1.pop(segname, None)
            if mate is not None:
                out_1.write('%s\n' % mate)
                out_2.write(next(bam2fq))
            else:
                reads_2[segname] = next(bam2fq)[:-1]
        else:
            out.write('%s\n' % next(bam2fq)[:-1])

        # Skip per-base qualities. They can start with '@'.
        next(bam2fq)
        next(bam2fq)

    # list(d)[:5] works on Python 2 and 3 (dict views cannot be sliced).
    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)
    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    # Orphans are written to the unpaired output.
    for read in reads_1.values():
        out.write('%s\n' % read)
    for read in reads_2.values():
        out.write('%s\n' % read)

    out_1.close()
    out_2.close()
    out.close()
def samples_by_patient(samples_path):
    """Print the samples of a sample list grouped by TCGA patient ID.

    The patient ID is the first match of ``TCGA-..-....`` in the sample
    name. Samples without such a match are left out of the listing and only
    counted in a warning.

    Args:
        samples_path: Path of the sample list, one name per line (opened
            with ``zopen``).
    """
    samples = [line.strip() for line in zopen(samples_path)]

    patients = {}
    num_without_pid = 0
    for s in samples:
        m = re.search('TCGA-..-....', s)
        if not m:
            num_without_pid += 1
            continue
        patients.setdefault(m.group(0), []).append(s)

    if num_without_pid:
        info('WARNING: %d sample names did not contain a TCGA patient ID.' %
             num_without_pid)

    # items() instead of iteritems(): available on Python 2 and 3.
    for patient, psamples in patients.items():
        print('Patient %s (%d samples):' % (patient, len(psamples)))
        for sample in psamples:
            print('- %s' % sample)
def smallrna_expression(read_paths, srna_reference_path):
    """Print a table of read counts per small RNA sequence and read file.

    Reference sequences are loaded with ``read_fasta`` and expanded with
    simple variants: the sequence without its last base (``-1``) and the
    sequence with one extra A/C/G/T appended (``+A`` .. ``+T``). Reads are
    then counted by exact sequence identity. The table contains every
    counted sequence that is a known reference or variant, plus unknown
    sequences that reach ``min_other_reads`` reads in at least two files.

    Args:
        read_paths: Paths of the read files (opened with ``zopen``).
        srna_reference_path: Path of the small RNA reference FASTA file.
    """
    S = len(read_paths)
    min_other_reads = 100

    # Read the FASTA file containing small RNA reference sequences and
    # construct a new FASTA file that includes potential isoforms and
    # variants of these small RNA sequences.
    info('Constructing database of reference small RNA sequences...')
    seq_names = defaultdict(lambda: '')
    counts = defaultdict(lambda: [0] * S)
    for name, seq in read_fasta(srna_reference_path).items():
        name = re.sub(' MIMAT.*', '', name)
        seq = seq.upper().replace('U', 'T')
        seq_names[seq] = name
        seq_names[seq[:-1]] = name + '-1'
        for extra in 'ACGT':
            seq_names[seq + extra] = name + '+' + extra

    info('Counting reads aligning to small RNA sequences...')
    for s, read_path in enumerate(read_paths):
        fasta = zopen(read_path)
        for line in fasta:
            if not line or line[0] == '#':
                continue
            # NOTE(review): every line starting with '>' or '@' is treated
            # as a header, including FASTQ quality lines that happen to
            # start with one of these characters - confirm the input format.
            if line[0] in '>@':
                seq = next(fasta)[:-1]
                counts[seq][s] += 1

    # Count per sequence the number of files reaching the read threshold.
    # (Comparing the whole list against the threshold, as was done before,
    # raises TypeError for every sequence that is not a known reference.)
    counts = {
        seq: count for seq, count in counts.items()
        if seq in seq_names or
        sum(c >= min_other_reads for c in count) >= 2
    }

    print('NAME\tSEQUENCE\t%s' % '\t'.join(read_paths))
    # Unknown sequences have the empty name and therefore sort first.
    for seq in sorted(counts, key=lambda x: seq_names[x]):
        sys.stdout.write('%s\t%s' % (seq_names[seq], seq))
        for x in counts[seq]:
            sys.stdout.write('\t%d' % x)
        sys.stdout.write('\n')
def cghub_download(samples):
    """Download the data of each sample unless the file is already present.

    Before every download the current directory tree is scanned again; a
    sample is skipped when a file with the name of its first file and the
    expected size exists anywhere below the current directory. Otherwise
    ``gtdownload`` is started through ``shell``.

    Args:
        samples: Iterable of objects with ``files``, ``filesizes`` and
            ``analysis_data_uri`` attributes.
    """

    def sizes_on_disk():
        # Map bare file name -> size for everything below the current dir.
        sizes = {}
        for root, _dirnames, filenames in os.walk('.'):
            for name in filenames:
                sizes[name] = os.stat(os.path.join(root, name)).st_size
        return sizes

    for sample in samples:
        # Don't redownload files that are already present.
        existing = sizes_on_disk()
        filename = sample.files[0]
        filesize = sample.filesizes[0]
        already_present = (filename in existing and
                           existing[filename] == filesize)
        if already_present:
            info('%s has already been downloaded...' % filename)
            continue

        info('Downloading %s...' % filename)
        shell('gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key' %
              sample.analysis_data_uri)
def coverage_cds(bam_path, gtf_path):
    """Print a histogram of sequencing coverage over coding regions.

    Builds a per-base mask of CDS positions from the GTF file, runs
    ``bedtools genomecov -d -split`` on the BAM file and counts, for every
    coding position, how often each coverage value occurs. Coverage values
    of 199 and above share the last histogram bin.

    Args:
        bam_path: Path of the BAM file.
        gtf_path: Path of the gene annotation (opened with ``zopen``).
    """
    chr_sizes = ref_sequence_sizes(bam_path)

    info('Constructing a map of coding regions...')
    # One byte per base (0 = non-coding, 1 = coding); far smaller than a
    # list of booleans for chromosome-sized sequences.
    coding = {}
    for chrom, size in chr_sizes.items():
        coding[chrom] = bytearray(size)
    for line in zopen(gtf_path):
        if line.startswith('#'):
            continue
        cols = line.split('\t')
        if cols[2] != 'CDS':
            continue
        if len(cols[0]) > 5:
            continue    # Ignore chromosomes other than chrXX
        mask = coding.get(cols[0])
        if mask is None:
            continue
        # GTF coordinates are 1-based inclusive. Clamp to the chromosome so
        # the slice assignment can never change the length of the mask.
        start = max(int(cols[3]) - 1, 0)
        end = min(int(cols[4]), len(mask))
        if end > start:
            # A slice needs an iterable of matching length; assigning a
            # bare True raises TypeError.
            mask[start:end] = b'\x01' * (end - start)

    info('Calculating a coverage histogram...')
    coverage_hist = [0] * 200
    chrom = ''
    pos = 0
    for line in shell_stdout('bedtools genomecov -d -split -ibam %s' %
                             bam_path):
        cols = line.split('\t')
        if cols[0] != chrom:
            chrom = cols[0]
            cds = coding[chrom]
            # Positions in the output are 1-based; after the increment
            # below, pos is the 0-based index of the current line.
            pos = int(cols[1]) - 2
            info('%s...' % chrom)
        pos += 1
        if cds[pos]:
            coverage_hist[min(int(cols[2]), len(coverage_hist) - 1)] += 1

    print('Coverage histogram:')
    print('===================')
    for cov, count in enumerate(coverage_hist):
        print('%d: %d' % (cov, count))
def sam_reads(bam_path, out_prefix):
    """Extract reads from a BAM file into paired and unpaired FASTQ files.

    Alignments flagged as first (0x40) or second (0x80) mate are matched by
    read name and written in the same order to ``<out_prefix>_1.fq.gz`` and
    ``<out_prefix>_2.fq.gz``. Reads with neither flag and mates whose
    partner never showed up are written to ``<out_prefix>.fq.gz``.

    Args:
        bam_path: Path of the BAM file (read with ``read_sam``).
        out_prefix: Prefix of the three output files.
    """
    fastq_1 = zopen('%s_1.fq.gz' % out_prefix, 'w')
    fastq_2 = zopen('%s_2.fq.gz' % out_prefix, 'w')
    fastq = zopen('%s.fq.gz' % out_prefix, 'w')

    # Mates still waiting for their partner: name -> (sequence, qualities).
    reads_1 = {}
    reads_2 = {}

    record = '@%s\n%s\n+\n%s\n'
    record_1 = '@%s/1\n%s\n+\n%s\n'
    record_2 = '@%s/2\n%s\n+\n%s\n'

    # FIXME: We assume that each read only has one alignment in the BAM file.
    for al in read_sam(bam_path):
        flags = int(al[1])
        if flags & 0x40:
            rname = al[0][:-2] if al[0].endswith('/1') else al[0]
            mate = reads_2.pop(rname, None)
            if mate:
                fastq_1.write(record_1 % (rname, al[9], al[10]))
                fastq_2.write(record_2 % (rname, mate[0], mate[1]))
            else:
                reads_1[rname] = (al[9], al[10])
        elif flags & 0x80:
            rname = al[0][:-2] if al[0].endswith('/2') else al[0]
            mate = reads_1.pop(rname, None)
            if mate:
                fastq_1.write(record_1 % (rname, mate[0], mate[1]))
                fastq_2.write(record_2 % (rname, al[9], al[10]))
            else:
                reads_2[rname] = (al[9], al[10])
        else:
            fastq.write(record % (al[0], al[9], al[10]))

    # list(d)[:5] and items() work on Python 2 and 3; keys()[:5] and
    # iteritems() only exist on Python 2.
    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)
    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    # Orphans end up in the unpaired file, without a mate suffix.
    for rname, read in reads_1.items():
        fastq.write(record % (rname, read[0], read[1]))
    for rname, read in reads_2.items():
        fastq.write(record % (rname, read[0], read[1]))

    fastq_1.close()
    fastq_2.close()
    fastq.close()
def somatic(vcf_path, sample_pairs):
    """Copy a VCF to stdout, keeping only variants somatic in some pair.

    A variant is kept when, for at least one ``test,control`` pair, the test
    sample's genotype index in the module-level ``gt_symbols`` list is at
    least 2 while the control sample's index is exactly 1.

    Args:
        vcf_path: Path of the VCF file (opened with ``zopen``).
        sample_pairs: Iterable of ``"test,control"`` strings naming sample
            columns of the VCF.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')
    anchor = 'ESP6500' if 'ESP6500' in headers else 'ALT'
    sample_col = headers.index(anchor) + 1
    samples = headers[sample_col:]

    # Convert sample pair names into index 2-tuples.
    sample_pairs = [pair.split(',') for pair in sample_pairs]
    malformed = [pair for pair in sample_pairs if len(pair) != 2]
    if malformed:
        info(malformed)
        error('Test and control samples must be in "test,control" format.')
    for test_name, control_name in sample_pairs:
        if test_name not in samples:
            error('Test sample %s was not found in VCF file.' % test_name)
        if control_name not in samples:
            error('Control sample %s was not found in VCF file.' %
                  control_name)
    sample_pairs = [(samples.index(test_name), samples.index(control_name))
                    for test_name, control_name in sample_pairs]

    sys.stdout.write(line)
    for line in vcf_file:
        gt_cols = line.rstrip('\n').split('\t')[sample_col:]
        # Each call looks like "<genotype>:<...>"; keep the text before ':'.
        genotypes = [gt_symbols.index(g[:g.find(':')]) for g in gt_cols]
        pair_is_somatic = [
            genotypes[test] >= 2 and genotypes[control] == 1
            for test, control in sample_pairs
        ]
        if any(pair_is_somatic):
            sys.stdout.write(line)
def variant_merge(vcf_paths):
    """Merge several VCF-like tables into one table printed to stdout.

    All variant rows are piped, tagged with the index of their source file,
    through an external ``sort`` on chromosome, position, reference and
    alternate allele. Rows describing the same variant are then combined
    into one output row that carries the genotype columns of all inputs;
    samples of files that do not contain the variant get the call ``:0:0``.

    Args:
        vcf_paths: Paths of the input files (opened with ``zopen``). Their
            non-sample header columns must be identical.
    """
    sort_in, sort_out = shell_stdinout('sort -k2,2 -k3,3n -k4,4 -k5,5')
    cons_headers = []   # Consensus headers
    vcf_samples = []    # Sample names of each VCF
    gtype_col = 4
    for vcf_index, vcf_path in enumerate(vcf_paths):
        info('Merging VCF file %s...' % vcf_path)
        vcf = zopen(vcf_path)
        for line in vcf:
            if not line.startswith('#'):
                break
        headers = line.rstrip('\n').split('\t')
        # Genotypes start after CHROM/POSITION/REF/ALT, or after the
        # ESP6500 annotation column when it is present.
        gtype_col = (headers.index('ESP6500') + 1
                     if 'ESP6500' in headers else 4)
        if not cons_headers:
            cons_headers = headers[:gtype_col]
        if cons_headers != headers[:gtype_col]:
            error('Header mismatch!')
        vcf_samples.append(headers[gtype_col:])
        for line in vcf:
            sort_in.write('%d\t%s' % (vcf_index, line))
    sort_in.close()

    print('\t'.join(cons_headers + sum(vcf_samples, [])))
    vcf_sample_counts = [len(samples) for samples in vcf_samples]
    S = sum(vcf_sample_counts)
    # First output genotype column (0-based within calls) of each input.
    vcf_sample_col = [
        sum(vcf_sample_counts[0:k]) for k in range(len(vcf_samples))
    ]

    info('Merged VCF will contain:')
    info('- %d header columns' % len(cons_headers))
    for samples, path in zip(vcf_samples, vcf_paths):
        info('- %d columns from %s' % (len(samples), path))

    prev = None
    calls = [':0:0'] * S
    for line in sort_out:
        cols = line.rstrip('\n').split('\t')
        vcf_index = int(cols[0])
        call_col = vcf_sample_col[vcf_index]
        # A variant is identified by its first four columns only. "prev"
        # may also hold annotation columns, so compare just that prefix;
        # comparing the whole list would never match when annotation
        # columns exist and no rows would be merged.
        if prev is None or prev[:4] != cols[1:5]:
            if prev is not None:
                print('\t'.join(prev + calls))
            prev = cols[1:gtype_col + 1]
            calls = [':0:0'] * S
        calls[call_col:call_col + vcf_sample_counts[vcf_index]] = \
            cols[gtype_col + 1:]
    # Handle the last line (there is none if the inputs had no variants).
    if prev is not None:
        print('\t'.join(prev + calls))
def ensembl_gene_bed(gtf_path):
    """Print one BED line per gene spanning all exons of that gene.

    Only ``exon`` rows on chromosomes listed in the module-level
    ``human_chr`` and with a source column listed in ``accepted_gene_types``
    are used. The output columns are chromosome (with ``chr`` prefix),
    0-based start, end, ``<gene name> (<gene id>)``, an empty column and the
    strand.

    Args:
        gtf_path: Path of the Ensembl GTF file (opened with ``zopen``).
    """
    gene_id_to_name = {}
    gene_exons = {}
    for line in zopen(gtf_path):
        if line.startswith('#'):
            continue
        c = line.rstrip('\n').split('\t')
        if c[0] not in human_chr:
            continue
        if c[1] not in accepted_gene_types:
            continue
        if c[2] != 'exon':
            continue

        chrom, start, end, strand = c[0], int(c[3]), int(c[4]), c[6]
        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom

        gene_id = re.search(r'gene_id "(.+?)"', line).group(1)
        # Fall back to the gene ID when a record carries no gene_name
        # attribute instead of failing on the missing match.
        m = re.search(r'gene_name "(.+?)"', line)
        gene_name = m.group(1) if m else gene_id

        gene_exons.setdefault(gene_id, []).append(
            (chrom, strand, start, end))
        gene_id_to_name[gene_id] = gene_name

    # items() instead of iteritems(): available on Python 2 and 3.
    for gene_id, exons in gene_exons.items():
        first_chrom, first_strand = exons[0][0], exons[0][1]
        if not all(exon[0] == first_chrom for exon in exons):
            info('Chromosome confusion detected.')
        if not all(exon[1] == first_strand for exon in exons):
            info('Strand confusion detected.')
        start = min(ex[2] for ex in exons)
        end = max(ex[3] for ex in exons)
        print('%s\t%d\t%d\t%s (%s)\t\t%s' % (
            first_chrom, start - 1, end,
            gene_id_to_name[gene_id], gene_id, first_strand))
def fasta_repair(fasta_1_path, fasta_2_path, out_1_path, out_2_path):
    """Re-pair the reads of two mate files whose order got out of sync.

    Records are read alternately from both inputs. A record whose mate has
    already been seen is written, together with that mate, to the two
    outputs; otherwise it is remembered until its mate shows up. Reads whose
    mate never appears are not written. FASTQ records whose quality string
    length differs from the sequence length are reported and dropped.

    Args:
        fasta_1_path: Input file with first mates (``/1``).
        fasta_2_path: Input file with second mates (``/2``).
        out_1_path: Output file for first mates.
        out_2_path: Output file for second mates.
    """
    fasta_1 = zopen(fasta_1_path)
    fasta_2 = zopen(fasta_2_path)
    out_1 = zopen(out_1_path, 'w')
    out_2 = zopen(out_2_path, 'w')

    # Reads waiting for their mate: header -> (sequence, qualities or None).
    orphans_1 = {}
    orphans_2 = {}

    def write_pair(header, mate_1, mate_2):
        # FASTQ output only if both mates carry qualities, else FASTA-style.
        if mate_1[1] and mate_2[1]:
            out_1.write('%s/1\n%s\n+\n%s\n' % (header, mate_1[0], mate_1[1]))
            out_2.write('%s/2\n%s\n+\n%s\n' % (header, mate_2[0], mate_2[1]))
        else:
            out_1.write('%s/1\n%s\n' % (header, mate_1[0]))
            out_2.write('%s/2\n%s\n' % (header, mate_2[0]))

    while not (fasta_1 is None and fasta_2 is None):
        # Consume at most one record from the first file ...
        if fasta_1:
            while True:
                line = fasta_1.readline()
                if line == '':
                    fasta_1.close()
                    fasta_1 = None
                    break
                if line[0] not in '>@':
                    continue
                header = line[:-1].replace('/1', '')
                seq = fasta_1.readline()[:-1]
                qual = None
                if header[0] == '@':
                    while line[0] != '+':
                        line = fasta_1.readline()
                    qual = fasta_1.readline()[:-1]
                    # Check that there are as many quality values as
                    # nucleotides.
                    if len(seq) != len(qual):
                        info('Read %s/1 discarded due to corrupted qualities.' %
                             header[:-1])
                        break
                mate = orphans_2.get(header)
                if mate:
                    del orphans_2[header]
                    write_pair(header, (seq, qual), mate)
                else:
                    orphans_1[header] = (seq, qual)
                break

        # ... and at most one record from the second file.
        if fasta_2:
            while True:
                line = fasta_2.readline()
                if line == '':
                    fasta_2.close()
                    fasta_2 = None
                    break
                if line[0] not in '>@':
                    continue
                header = line[:-1].replace('/2', '')
                seq = fasta_2.readline()[:-1]
                qual = None
                if header[0] == '@':
                    while line[0] != '+':
                        line = fasta_2.readline()
                    qual = fasta_2.readline()[:-1]
                    # Check that there are as many quality values as
                    # nucleotides.
                    if len(seq) != len(qual):
                        info('Read %s/2 discarded due to corrupted qualities.' %
                             header[:-1])
                        break
                mate = orphans_1.get(header)
                if mate:
                    del orphans_1[header]
                    write_pair(header, mate, (seq, qual))
                else:
                    orphans_2[header] = (seq, qual)
                break

    out_1.close()
    out_2.close()
def _fasta_check_read_record(fasta, mate_suffix):
    """Read the next FASTA/FASTQ record from an open file.

    Returns ``None`` at end of file, otherwise a tuple
    ``(header, seq, qual, corrupted)``. ``header`` has every occurrence of
    ``mate_suffix`` removed, ``qual`` is ``None`` for FASTA records, and
    ``corrupted`` is true when a FASTQ record's quality string length
    differs from its sequence length.
    """
    while True:
        line = fasta.readline()
        if line == '':
            return None
        if line[0] not in '>@':
            continue
        header = line[:-1].replace(mate_suffix, '')
        seq = fasta.readline()[:-1]
        qual = None
        corrupted = False
        if header[0] == '@':
            # Skip ahead to the '+' separator. Stop at end of file as well,
            # so a truncated record is flagged instead of raising.
            while line and line[0] != '+':
                line = fasta.readline()
            qual = fasta.readline()[:-1]
            corrupted = len(seq) != len(qual)
        return header, seq, qual, corrupted


def fasta_check(fasta_1_path, fasta_2_path, out_1_path, out_2_path):
    """Split two mate files into consistent and inconsistent read pairs.

    Records are read in lockstep from both inputs. A pair is written to the
    regular outputs when both headers agree (ignoring ``/1`` and ``/2``)
    and neither FASTQ record has a quality string of the wrong length.
    Every other pair goes to ``bad.<out_1_path>`` and ``bad.<out_2_path>``.
    Processing stops with a message as soon as one input ends before the
    other.

    Args:
        fasta_1_path: Input file with first mates.
        fasta_2_path: Input file with second mates.
        out_1_path: Output file for good first mates.
        out_2_path: Output file for good second mates.
    """
    fasta_1 = zopen(fasta_1_path)
    fasta_2 = zopen(fasta_2_path)
    out_1 = zopen(out_1_path, 'w')
    out_2 = zopen(out_2_path, 'w')
    bad_out_1 = zopen('bad.' + out_1_path, 'w')
    bad_out_2 = zopen('bad.' + out_2_path, 'w')

    while True:
        # Each record carries its own quality value, so nothing can leak
        # over from a previous record.
        rec_1 = _fasta_check_read_record(fasta_1, '/1')
        rec_2 = _fasta_check_read_record(fasta_2, '/2')
        if rec_1 is None and rec_2 is None:
            break
        if rec_1 is None or rec_2 is None:
            info('File terminated abruptly.')
            break

        header_1, seq_1, qual_1, corrupted_1 = rec_1
        header_2, seq_2, qual_2, corrupted_2 = rec_2
        discard = corrupted_1 or corrupted_2 or header_1 != header_2
        if discard:
            dest_1, dest_2 = bad_out_1, bad_out_2
        else:
            dest_1, dest_2 = out_1, out_2

        # FASTQ output only if both mates carry qualities, else FASTA-style.
        if qual_1 and qual_2:
            dest_1.write('%s/1\n%s\n+\n%s\n' % (header_1, seq_1, qual_1))
            dest_2.write('%s/2\n%s\n+\n%s\n' % (header_2, seq_2, qual_2))
        else:
            dest_1.write('%s/1\n%s\n' % (header_1, seq_1))
            dest_2.write('%s/2\n%s\n' % (header_2, seq_2))

    for handle in (fasta_1, fasta_2, out_1, out_2, bad_out_1, bad_out_2):
        handle.close()
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    """Print B-allele fractions of a tumor at heterozygous germline sites.

    NOTE(review): a function with the same name is defined again further
    down in this file; the later definition replaces this one at import
    time - confirm which one should be kept.

    The first BAM file is treated as the tumor sample and the second as the
    matched normal. For every candidate allele reported by
    ``simple_pileup`` whose genotype in the normal sample is called as 2
    (printed as ``0/1``) with at least 15 total reads there, one line
    ``CHROM POSITION REF ALT <fraction>`` is printed, where the fraction is
    the share of tumor reads supporting the allele (``NaN`` without reads).

    Args:
        bam_paths: BAM file paths; tumor first, normal second.
        genome_path: Path of the genome FASTA file.
        kgenomes_path: Passed through to ``simple_pileup``.
        options: Object providing ``region``, ``ignore_mapq``, ``min_mapq``,
            ``keep_all`` and ``min_hetz_reads``; also handed to
            ``call_genotypes``.
    """
    min_normal_reads = 15   # minimum depth in the normal sample

    if len(bam_paths) < 2:
        # Indices 0 (tumor) and 1 (normal) are both required below.
        error('A tumor and a normal BAM file are required.')
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(
            bam_paths, genome_path, kgenomes_path,
            min_mapq=options.min_mapq,
            min_alt_alleles=(0 if options.keep_all
                             else options.min_hetz_reads),
            region=options.region):
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Each sample column holds space-separated triplets: allele, read
        # count, and a second count that is only added when mapping quality
        # is ignored for that sample.
        pileups = [p.split(' ') for p in tokens[3:]]
        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))
        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1]) + \
                    (int(pileup[a + 2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # Keep only sites that are heterozygous in the normal sample
            # and sufficiently covered there.
            if genotypes[1] != 2:
                continue
            if total_reads[1] < min_normal_reads:
                continue

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':       # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':     # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            # B-allele fraction of the tumor sample.
            tumor_alt, tumor_total = reads[0], total_reads[0]
            sys.stdout.write('\t'.join(
                [tokens[0], tokens[1], ref, alt.upper()]))
            sys.stdout.write(
                '\tNaN' if tumor_total == 0
                else '\t%.2f' % (float(tumor_alt) / tumor_total))
            sys.stdout.write('\n')
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    """Print B-allele fractions of a tumor at heterozygous germline sites.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later definition is the one in effect after import.

    The first BAM file is treated as the tumor sample and the second as the
    matched normal. For every candidate allele reported by
    ``simple_pileup`` whose genotype in the normal sample is called as 2
    (printed as ``0/1``) with at least 15 total reads there, one line
    ``CHROM POSITION REF ALT <fraction>`` is printed, where the fraction is
    the share of tumor reads supporting the allele (``NaN`` without reads).

    Args:
        bam_paths: BAM file paths; tumor first, normal second.
        genome_path: Path of the genome FASTA file.
        kgenomes_path: Passed through to ``simple_pileup``.
        options: Object providing ``region``, ``ignore_mapq``, ``min_mapq``,
            ``keep_all`` and ``min_hetz_reads``; also handed to
            ``call_genotypes``.
    """
    min_normal_reads = 15   # minimum depth in the normal sample

    if len(bam_paths) < 2:
        # Indices 0 (tumor) and 1 (normal) are both required below.
        error('A tumor and a normal BAM file are required.')
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    pileup_lines = simple_pileup(
        bam_paths, genome_path, kgenomes_path,
        min_mapq=options.min_mapq,
        min_alt_alleles=(0 if options.keep_all else options.min_hetz_reads),
        region=options.region)
    for line in pileup_lines:
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Each sample column holds space-separated triplets: allele, read
        # count, and a second count that is only added when mapping quality
        # is ignored for that sample.
        pileups = [p.split(' ') for p in tokens[3:]]
        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))
        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1])
                if ignore_mapq[s]:
                    count += int(pileup[a + 2])
                total_reads[s] += count
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # Keep only sites that are heterozygous in the normal sample
            # and sufficiently covered there.
            if genotypes[1] != 2 or total_reads[1] < min_normal_reads:
                continue

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':       # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':     # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            # B-allele fraction of the tumor sample.
            tumor_alt, tumor_total = reads[0], total_reads[0]
            if tumor_total == 0:
                baf = 'NaN'
            else:
                baf = '%.2f' % (float(tumor_alt) / tumor_total)
            sys.stdout.write('\t'.join(
                [tokens[0], tokens[1], ref, alt.upper()]))
            sys.stdout.write('\t' + baf)
            sys.stdout.write('\n')
def variant_call(bam_paths, genome_path, options):
    """Call variants across BAM files and print them as a tab-separated table.

    For every allele reported by ``simple_pileup`` the genotypes of all
    samples are called with ``call_genotypes``. One output line holds
    chromosome, position, reference and alternate allele followed by one
    ``<genotype>:<allele reads>:<total reads>`` column per sample. Unless
    ``options.keep_all`` is set, alleles for which every genotype index is
    below 2 are omitted.

    Args:
        bam_paths: Paths of the BAM files, one per sample.
        genome_path: Path of the genome FASTA file.
        options: Object providing ``region``, ``ignore_mapq``, ``min_mapq``,
            ``keep_all`` and ``min_hetz_reads``; also handed to
            ``call_genotypes``.
    """
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % '\t'.join(samples))

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(
            bam_paths, genome_path, min_mapq=options.min_mapq,
            min_alt_alleles=(0 if options.keep_all
                             else options.min_hetz_reads),
            region=options.region):
        # Same handling as in calculate_BAF: the pileup may deliver bytes.
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Each sample column holds space-separated triplets: allele, read
        # count, and a second count that is only added when mapping quality
        # is ignored for that sample.
        pileups = [p.split(' ') for p in tokens[3:]]
        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))
        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1]) + \
                    (int(pileup[a + 2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        # items() instead of iteritems(): available on Python 2 and 3.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)
            if not options.keep_all and all(gt < 2 for gt in genotypes):
                continue

            gtypes = ['%s:%d:%d' % (gt_symbols[g], reads[s], total_reads[s])
                      for s, g in enumerate(genotypes)]

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':       # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':     # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            print('%s\t%s\t%s\t%s\t%s' % (
                tokens[0], tokens[1], ref, alt.upper(), '\t'.join(gtypes)))
def detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len):
    """Find split reads that support a rearrangement breakpoint.

    Unaligned reads are cut into anchors of ``anchor_len`` bases which are
    aligned against the genome by an external pipeline. For every read with
    two aligned anchors the reference sequence flanking both anchors is
    fetched, pairs that look like ordinary homologous alignments are
    dropped, and the breakpoint position inside the read with the fewest
    mismatches is determined. Results are written to
    ``<out_prefix>.discordant_reads.tsv.gz``.

    Args:
        sam_path: Path of the alignment file with the unaligned reads.
        genome_path: Genome path used for both the anchor alignment and
            ``read_flat_seq``.
        out_prefix: Prefix of the output file.
        anchor_len: Anchor length in bases.
    """
    out = zopen(out_prefix + '.discordant_reads.tsv.gz', 'w')
    num_pairs = 0

    info('Splitting unaligned reads into %d bp anchors and aligning against '
         'the genome...' % anchor_len)

    # IMPORTANT: Only one thread can be used, otherwise alignment order is not
    # guaranteed and the loop below will fail.
    anchor_alignments = shell_stdout(
        'samtools fasta -f 0x4 %s | fasta split interleaved - %d | '
        'bowtie -f -p1 -v0 -m1 -B1 --suppress 5,6,7,8 %s -' %
        (sam_path, anchor_len, genome_path))

    # Make sure every chromosome name carries the 'chr' prefix.
    chromosomes = read_flat_seq(genome_path)
    for name in list(chromosomes.keys()):
        if not name.startswith('chr'):
            chromosomes['chr' + name] = chromosomes.pop(name)

    max_homology = 0.7

    def count_mismatches(read, reference, length):
        return sum([read[i] != reference[i] for i in range(length)])

    prev = ['']
    for line in anchor_alignments:
        al = line.split('\t')
        if al[0][-2] == '/':
            al[0] = al[0][:-2]

        # Two consecutive alignments with the same read name form a pair;
        # anything else just becomes the new candidate.
        if al[0] != prev[0]:
            prev = al
            continue

        chrom, mchrom = prev[2], al[2]
        strand, mstrand = prev[1], al[1]
        pos, mpos = int(prev[3]), int(al[3])
        seq = prev[0][prev[0].find('_') + 1:]
        full_len = len(seq)

        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom
        if not mchrom.startswith('chr'):
            mchrom = 'chr' + mchrom

        # Ignore anchor pairs where the anchors are too close.
        if chrom == mchrom and abs(pos - mpos) < full_len - anchor_len + 10:
            continue

        # Ignore rearrangements involving mitochondrial DNA.
        if 'M' in chrom or 'M' in mchrom:
            continue

        # Reorient the pairs so the first anchor is always upstream.
        # If mates are swapped, both mates must be reverse-complemented.
        if chrom > mchrom or (chrom == mchrom and pos > mpos):
            chrom, mchrom = mchrom, chrom
            pos, mpos = mpos, pos
            strand, mstrand = '+' if mstrand == '-' else '-', \
                '+' if strand == '-' else '-'
            seq = revcomplement(seq)

        # Extract the flanking sequences from the chromosome sequences.
        # The range calculations are a bit complex. It's easier to understand
        # them if you first add one to all indices to convert to 1-based
        # genomic coordinates ("pos" and "mpos" are 1-based).
        left_chr_seq = chromosomes[chrom]
        right_chr_seq = chromosomes[mchrom]
        if strand == '+':
            left_grch = left_chr_seq[pos - 1:pos + full_len - 1]
        else:
            left_grch = revcomplement(
                left_chr_seq[pos + anchor_len - full_len - 1:
                             pos + anchor_len - 1])
        if mstrand == '+':
            right_grch = right_chr_seq[mpos + anchor_len - full_len - 1:
                                       mpos + anchor_len - 1]
        else:
            right_grch = revcomplement(
                right_chr_seq[mpos - 1:mpos + full_len - 1])

        # If the read is at the very edge of a chromosome, ignore it.
        if len(left_grch) < full_len or len(right_grch) < full_len:
            continue

        # Make sure that reference sequences are in uppercase
        left_grch = left_grch.upper()
        right_grch = right_grch.upper()

        # Check that the read sequence is not too homologous on either side
        # of the breakpoint.
        left_hits = sum([seq[i] == left_grch[i]
                         for i in range(full_len - anchor_len, full_len)])
        right_hits = sum([seq[i] == right_grch[i]
                          for i in range(anchor_len)])
        left_match = float(left_hits) / anchor_len
        right_match = float(right_hits) / anchor_len
        if left_match >= max_homology or right_match >= max_homology:
            continue

        # Identify the breakpoint location that minimizes the number of
        # nucleotide mismatches between the read and the breakpoint flanks.
        potential_breakpoints = range(anchor_len, full_len - anchor_len + 1)
        mismatches = [
            count_mismatches(seq, left_grch[:cut] + right_grch[cut:],
                             full_len)
            for cut in potential_breakpoints
        ]
        least_mismatches = min(mismatches)

        # "br" represent the number of nucleotides in the read
        # before the breakpoint, counting from the 5' end of the read.
        # If there is microhomology, we pick the first breakpoint.
        br = potential_breakpoints[mismatches.index(least_mismatches)]

        # Now that we know the exact fusion breakpoint, we mark mismatches
        # with a lower case nucleotide and augment the read
        # sequence with a | symbol to denote the junction.
        grch_chimera = left_grch[:br] + right_grch[br:]
        marked = []
        for k, nuc in enumerate(seq):
            marked.append(nuc if grch_chimera[k] == nuc else nuc.lower())
        seq = ''.join(marked)
        seq = seq[:br] + '|' + seq[br:]

        # Make positions represent read starts.
        if strand == '-':
            pos += anchor_len - 1
        if mstrand == '-':
            mpos += anchor_len - 1

        # Each discordant anchor pair is represented as a 7-tuple
        # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence).
        # Positions are 1-based and represent read starts.
        out.write('%s\t%s\t%d\t%s\t%s\t%d\t%s\n' %
                  (chrom, strand, pos, mchrom, mstrand, mpos, seq))
        num_pairs += 1

    info('Found %d discordant anchor pairs.' % num_pairs)
    out.close()
def detect_rearrangements(sam_path, genome_path, out_prefix, anchor_len,
                          min_mapq, orientation, max_frag_len,
                          discard_duplicates='both-ends'):
    """Cluster discordant evidence into rearrangements and write them out.

    The function runs the discordant mate pair search, optionally the split
    read search (when ``anchor_len`` > 0), sorts the combined evidence by
    chromosome and position into ``<out_prefix>.sorted_pairs.tsv.gz`` and
    then groups nearby, identically oriented evidence into rearrangements.
    Rearrangements with at least two supporting reads (after the selected
    duplicate removal) are written to ``<out_prefix>.sv``.

    Args:
        sam_path: Path of the alignment file; must exist.
        genome_path: Reference genome path, only passed on to the split
            read analysis.
        out_prefix: Prefix for all intermediate and output files.
        anchor_len: Anchor length for split read analysis; values <= 0
            disable that analysis.
        min_mapq: Minimum mapping quality, passed to the pair search.
        orientation: Mate orientation, passed to the pair search.
        max_frag_len: Maximum fragment length; also the distance limit used
            when clustering evidence.
        discard_duplicates: One of 'no', 'both-ends' or 'one-end'.
    """
    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)

    if discard_duplicates not in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path, out_prefix, max_frag_len=max_frag_len,
                            min_mapq=min_mapq, orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    # NOTE(review): out_prefix is interpolated into the shell command without
    # quoting; prefixes containing spaces or shell metacharacters will break
    # the command. The <(...) syntax presumably requires shell() to use bash.
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir:
        sort_tmp_dir = './'

    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
          (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        """Write rearrangement r if it has >= 2 reads; return 1 if written."""
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)

        if len(r.reads) < 2:
            return 0

        # Evidence without a sequence comes from mate pairs; evidence with
        # a sequence comes from split reads.
        num_pairs = sum(1 for read in r.reads if read[2] is None)
        split_seqs = [read[2] for read in r.reads if read[2] is not None]
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' %
                  (r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
                   num_pairs, len(split_seqs), ';'.join(split_seqs)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')

    N = 0
    rearrangements = []
    # The context manager guarantees that the .sv file is flushed and closed
    # even if an exception interrupts the clustering.
    with open('%s.sv' % out_prefix, 'w') as out:
        out.write(sv_file_header + '\n')

        for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
            # Strip only the line terminator so that a final line without a
            # trailing newline does not lose its last character.
            al = line.rstrip('\n').split('\t')

            chrom = al[0]
            strand = al[1]
            pos = int(al[2])
            mchr = al[3]
            mstrand = al[4]
            mpos = int(al[5])
            seq = None if al[6] == '-' else al[6]

            # Rearrangements that are too far need not be considered in the
            # future.
            # NOTE(review): clusters left over from a previous chromosome
            # stay in the list until a later position exceeds the distance
            # limit or the input ends. They can never match (chromosomes
            # must be equal), so only output order and memory are affected.
            reachable = []
            for r in rearrangements:
                if pos - r.pos > max_frag_len:
                    N += report_rearrangement(out, r)
                else:
                    reachable.append(r)
            rearrangements = reachable

            # Check if we already have a rearrangement that matches the new
            # pair. We don't check the distance for the first mate because
            # we already know from above the rearrangements near it.
            matches = [
                r for r in rearrangements
                if abs(mpos - r.mpos) <= max_frag_len and
                chrom == r.chr and mchr == r.mchr and
                strand == r.strand and mstrand == r.mstrand
            ]

            read = (pos, mpos, seq)
            if matches:
                for match in matches:
                    match.reads.append(read)
            else:
                # No suitable rearrangements, create a new one.
                rearrangements.append(
                    Rearrangement(chrom, strand, pos, mchr, mstrand, mpos,
                                  read))

        # Flush the clusters that were still open at the end of the input.
        for r in rearrangements:
            N += report_rearrangement(out, r)

    info('Found %d rearrangements with at least 2 reads of evidence.' % N)
def detect_specific(bam_path, donors_path, acceptors_path, genome_path,
                    out_prefix, all_reads):
    """Count reads supporting junctions between given donor/acceptor exons.

    Every donor exon is combined with every acceptor exon into a synthetic
    junction sequence (donor flank followed by acceptor flank). The
    junctions are indexed with bowtie-build, reads from the BAM file are
    aligned against them with Bowtie, and the number of reads hitting each
    junction is written to ``<out_prefix>.tsv``.

    Args:
        bam_path: Alignment file that provides the reads.
        donors_path: BED file with donor (5') exons.
        acceptors_path: BED file with acceptor (3') exons.
        genome_path: Reference genome FASTA.
        out_prefix: Prefix for the temporary index files and the result.
        all_reads: If true, align all reads; otherwise only the reads
            returned by ``sam unaligned reads``.
    """
    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)
    flank_len = read_len - 10

    chromosomes = read_fasta(genome_path)

    # NOTE(review): regions are presumably (chrom, strand, start, end)
    # tuples with 1-based inclusive coordinates - confirm against
    # regions_from_bed().
    donors = []
    for ex in regions_from_bed(donors_path):
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            donors.append((chrom, '+', ex[3],
                           chr_seq[ex[3] - flank_len:ex[3]]))
        elif ex[1] == '-':
            donors.append(
                (chrom, '-', ex[2],
                 revcomplement(chr_seq[ex[2] - 1:ex[2] - 1 + flank_len])))

    acceptors = []
    for ex in regions_from_bed(acceptors_path):
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            acceptors.append(
                (chrom, '+', ex[2],
                 chr_seq[ex[2] - 1:ex[2] - 1 + flank_len]))
        elif ex[1] == '-':
            acceptors.append(
                (chrom, '-', ex[3],
                 revcomplement(chr_seq[ex[3] - flank_len:ex[3]])))

    del chromosomes  # Release 3 GB of memory
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences. The names of both flanks are remembered
    # per junction instead of being recovered later by splitting the
    # junction name on '_', which fails for contig names that themselves
    # contain underscores (e.g. chr1_gl000191_random).
    junctions = {}
    junction_flanks = {}
    for left in donors:
        left_name = '%s:%s:%d' % left[:3]
        for right in acceptors:
            right_name = '%s:%s:%d' % right[:3]
            name = left_name + '_' + right_name
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
            junction_flanks[name] = (left_name, right_name)

    info('Generated %d junctions.' % len(junctions))

    # Build Bowtie index
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    with open(index_fasta_path, 'w') as index:
        for name, junction in junctions.items():
            index.write('>%s\n%s\n' % (name, junction.sequence))

    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path

    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
                             (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        # Column 3 is the name of the junction that was hit and column 5
        # the read sequence in Bowtie's default output format.
        junctions[cols[2]].reads.append(cols[4])

    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    with open(out_prefix + '.tsv', 'w') as out_file:
        out_file.write(
            '5\' breakpoint\t3\' breakpoint\tNum reads\tSequences\n')
        for name, j in junctions.items():
            if not j.reads:
                continue
            left_name, right_name = junction_flanks[name]
            # The "Sequences" column is intentionally left empty; writing
            # ';'.join(j.reads) here would list the supporting reads.
            out_file.write('%s\t%s\t%d\t' %
                           (left_name, right_name, len(j.reads)))
            out_file.write('\n')
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq,
                            orientation):
    """Write discordant mate pairs to <out_prefix>.discordant_pairs.tsv.gz.

    The output of ``sam discordant pairs`` is sorted by read name so that
    both mates of a pair end up on consecutive lines. Each pair is
    reoriented so that the first mate is upstream and both mates are
    expressed in forward-forward orientation, and is then written as a
    tab-separated line (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, '-').

    Args:
        sam_path: Alignment file handed to ``sam discordant pairs``.
        out_prefix: Prefix of the output file; its directory is also used
            for temporary sort files.
        max_frag_len: Maximum fragment length, handed to
            ``sam discordant pairs``.
        min_mapq: Minimum mapping quality, handed to
            ``sam discordant pairs``.
        orientation: Library orientation: 'fr', 'rf' or 'ff'.
    """
    flip = {'+': '-', '-': '+'}

    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    N = 0

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir:
        sort_tmp_dir = './'

    # Go through all the first mates and look for discordant pairs.
    info('Searching for discordant read pairs...')

    try:
        prev = ['']
        for line in shell_stdout(
                'sam discordant pairs --min-mapq=%d %s %d | sort -k1,1 -T %s'
                % (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):
            # Strip the line terminator so that it can never be counted as
            # part of the read sequence in the last column.
            al = line.rstrip('\n').split('\t')
            # The read sequence in column 10 (index 9) is accessed below,
            # so at least ten columns are required.
            if len(al) < 10:
                continue

            # Discard spliced and clipped reads.
            # FIXME: Add support for spliced RNA-seq reads.
            if 'N' in al[5] or 'S' in al[5]:
                continue

            if al[0].endswith('/1') or al[0].endswith('/2'):
                al[0] = al[0][:-2]  # Remove /1 or /2 suffix

            # Wait until the second alignment with the same read name.
            if al[0] != prev[0]:
                prev = al
                continue

            flags = int(al[1])
            chrom = al[2]
            mchr = prev[2]
            # Bits 0x10 and 0x20 of this alignment's flags give the strand
            # of the read itself and of its mate.
            strand = '-' if flags & 0x10 else '+'
            mstrand = '-' if flags & 0x20 else '+'
            pos = int(al[3])
            mpos = int(prev[3])
            rlen = len(al[9])
            mrlen = len(prev[9])

            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom
            if not mchr.startswith('chr'):
                mchr = 'chr' + mchr

            if chrom == 'chrM' or mchr == 'chrM':
                continue  # Discard mitochondrial

            supported = orientation in ('fr', 'rf', 'ff')
            if not supported:
                error('Unsupported read orientation detected.')

            # Reorient pairs so that the first mate is always upstream.
            swapped = supported and (
                chrom > mchr or (chrom == mchr and pos > mpos))
            if swapped:
                chrom, mchr = mchr, chrom
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = mstrand, strand

            if orientation == 'fr':
                # Convert to forward-forward orientation (flip second mate).
                mstrand = flip[mstrand]
            elif orientation == 'rf':
                # Convert to forward-forward orientation (flip first mate).
                strand = flip[strand]
            elif swapped:
                # 'ff': if mates are swapped, both mates must be reversed.
                strand = flip[strand]
                mstrand = flip[mstrand]

            # Make positions represent read starts.
            if strand == '-':
                pos += rlen - 1
            if mstrand == '-':
                mpos += mrlen - 1

            # Each discordant mate pair is represented as a 7-tuple
            # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, None).
            # The None at the end (written as '-') signifies that this is
            # a mate pair. Positions are 1-based and represent read starts.
            out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
                      (chrom, strand, pos, mchr, mstrand, mpos))
            N += 1
    finally:
        # Always close the compressed output, even when the search fails.
        out.close()

    info('Found %d discordant mate pairs.' % N)