Example #1
def variant_filter(vcf_path, nonsynonymous, no_1000g):
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break
    sys.stdout.write(line)

    headers = line[:-1].split('\t')

    if nonsynonymous and 'EXONIC_FUNCTION' not in headers:
        error('Cannot find exonic function column.')
    if no_1000g and '1000G' not in headers:
        error('Cannot find 1000 Genomes column.')

    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    col_1000g = headers.index('1000G') if '1000G' in headers else None
    col_exonic_func = (headers.index('EXONIC_FUNCTION')
                       if 'EXONIC_FUNCTION' in headers else None)

    for line in vcf_file:
        cols = line[:-1].split('\t')

        if nonsynonymous:
            if not cols[col_exonic_func].startswith(
                ('nonsynonymous', 'frameshift', 'stopgain', 'stoploss',
                 'nonframeshift')):
                continue

        if no_1000g:
            if cols[col_1000g]: continue

        sys.stdout.write(line)
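The examples on this page lean on a few small utilities from the surrounding module (zopen, error, info) that are never shown. A minimal sketch of plausible stand-ins, assuming zopen merely opens plain or gzip-compressed files transparently:

import gzip
import sys

def zopen(path, mode='r'):
    # Hypothetical helper: open a plain or gzip-compressed file in text mode.
    if path.endswith('.gz'):
        return gzip.open(path, mode + 't')
    return open(path, mode)

def error(msg):
    # Hypothetical helper: report a fatal error and abort.
    sys.stderr.write('ERROR: %s\n' % msg)
    sys.exit(1)

def info(msg):
    # Hypothetical helper: print a progress message to stderr.
    sys.stderr.write('%s\n' % msg)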
Example #2
def swiss_igv(tsv_path, data_col, one_based=True):
    tsv_file = zopen(tsv_path)
    headers = next(tsv_file)[:-1].split('\t')

    chrom_col = [
        i for i, h in enumerate(headers[:data_col])
        if re.match('chrom', h, re.I)
    ]
    if len(chrom_col) != 1: error('Cannot find chromosome column.')
    chrom_col = chrom_col[0]

    pos_col = [
        i for i, h in enumerate(headers[:data_col])
        if re.match('pos', h, re.I)
    ]
    if len(pos_col) != 1: error('Cannot find position column.')
    pos_col = pos_col[0]

    print('CHROMOSOME\tSTART\tEND\tFEATURE\t' + '\t'.join(headers[data_col:]))
    for line in tsv_file:
        tokens = line[:-1].split('\t')
        chr = tokens[chrom_col]
        pos = int(tokens[pos_col])
        if one_based: pos -= 1
        sys.stdout.write('%s\t%d\t%d\t-\t' % (chr, pos, pos + 1))
        print('\t'.join(tokens[data_col:]))
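The pos -= 1 conversion reflects the fact that IGV/BED-style intervals are zero-based and half-open, while the input column is assumed one-based; the printed START/END pair therefore spans exactly the single input position.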
Example #3
def discard_if_in_controls(vcf_path, control_samples, threshold):
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    control = [
        any(re.search(rx, s) for rx in control_samples)
        for s in headers[sample_col:]
    ]
    if not any(control): error('No control samples found.')

    info('Using these %d control samples:' % sum(control))
    for s, c in zip(headers[sample_col:], control):
        if c: info('- %s' % s)

    sys.stdout.write(line)
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')[sample_col:]
        genotypes = [gt_symbols.index(c[:c.find(':')]) for c in cols]
        if sum(c and gt > 1 for c, gt in zip(control, genotypes)) >= threshold:
            continue
        sys.stdout.write(line)
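The genotype lookups above assume a module-level gt_symbols list, the same list Example #17 defines locally: gt_symbols = ['', '0/0', '0/1', '1/1']. Any index above 1 (i.e. '0/1' or '1/1') thus marks a sample as carrying the variant allele.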
Example #4
def fasta_remove_adapters(fasta_path, adapter):
    # Convert the adapter into a regular expression
    if len(adapter) < 5: error('Adapter sequence is too short.')
    adapter_re = adapter[:5]
    for base in adapter[5:]:
        adapter_re += '(?:' + base
    adapter_re += (len(adapter) - 5) * ')?'
    adapter_re = re.compile(adapter_re)

    info('Adapter regular expression: %s' % adapter_re.pattern)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            if m: seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))
            print(next(fasta)[:trim_len])
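As an illustration of the construction above, a hypothetical seven-base adapter yields a pattern whose bases beyond the fifth are progressively optional, so a truncated adapter at the end of a read still matches:

adapter = 'AGATCGG'
regex = adapter[:5]                  # 'AGATC'
for base in adapter[5:]:
    regex += '(?:' + base            # 'AGATC(?:G(?:G'
regex += (len(adapter) - 5) * ')?'   # 'AGATC(?:G(?:G)?)?'
# The result matches 'AGATC', 'AGATCG' and 'AGATCGG'.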
Example #5
def swiss_download_sra(sra_study):
    if not sra_study.startswith('SRP'):
        error('SRA study identifier must begin with "SRP".')

    shell('/data/csb/tools/ncftp-3.2.5/bin/ncftpget -R -v '
          'ftp-trace.ncbi.nlm.nih.gov ./ '
          '/sra/sra-instant/reads/ByStudy/sra/SRP/%s/%s' %
          (sra_study[:6], sra_study))
Example #6
def sam_reads_raw(bam_path, out_prefix):
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    reads_1 = {}
    reads_2 = {}

    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read only has one primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
    for line in bam2fq:
        if line[0] != '@': error('Invalid bam2fq output.')
        line = line[:-1]
        if line.endswith('/1'):
            segname = line[1:-2]
            mate = reads_2.pop(segname, None)
            if mate:
                out_1.write(next(bam2fq))
                out_2.write('%s\n' % mate)
            else:
                reads_1[segname] = next(bam2fq)[:-1]
        elif line.endswith('/2'):
            segname = line[1:-2]
            mate = reads_1.pop(segname, None)
            if mate:
                out_1.write('%s\n' % mate)
                out_2.write(next(bam2fq))
            else:
                reads_2[segname] = next(bam2fq)[:-1]
        else:
            out.write('%s\n' % next(bam2fq)[:-1])

        # Skip per-base qualities. They can start with '@'.
        next(bam2fq)
        next(bam2fq)

    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)

    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    for read in reads_1.values():
        out.write('%s\n' % read)

    for read in reads_2.values():
        out.write('%s\n' % read)

    out_1.close()
    out_2.close()
    out.close()
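The has_mate_suffixes helper decides whether to pass -n to samtools bam2fq, which suppresses appending /1 and /2 to read names. A plausible sketch, assuming it simply peeks at the first alignment's read name:

def has_mate_suffixes(bam_path):
    # Hypothetical helper: true if read names already carry /1 or /2 suffixes.
    for line in shell_stdout('samtools view %s' % bam_path):
        return line.split('\t')[0].endswith(('/1', '/2'))
    return False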
Example #7
def read_length(bam_path):
    read_lens = []
    for al in read_sam(bam_path, 'a'):
        if len(read_lens) >= 100: break
        read_lens.append(len(al[9]))

    if not read_lens:
        error('SAM file contains no reads.')
    if len(set(read_lens)) > 1:
        error('SAM file contains reads of varying length.')
    return read_lens[0]
Example #8
def filter_by_region(sv_path, region):
    m = re.match(r'(chr.+): *(\d+) *- *(\d+)', region.strip())
    if not m: error('Invalid region specified.')

    chr = m.group(1)
    start = int(m.group(2))
    end = int(m.group(3))

    for line in zopen(sv_path):
        if not line.startswith('chr'):
            sys.stdout.write(line)
            continue
        c = line.rstrip().split('\t')

        if not chr in (c[0], c[5]): continue
        if (start <= int(c[2]) <= end) or (start <= int(c[7]) <= end):
            sys.stdout.write(line)
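The region regex accepts strings like 'chr7:140400000-140500000' or 'chr7: 140400000 - 140500000' (spaces around the dash are optional); a hypothetical call:

filter_by_region('rearrangements.sv', 'chr7:140400000-140500000')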
Example #9
def variant_merge(vcf_paths):
    sort_in, sort_out = shell_stdinout('sort -k2,2 -k3,3n -k4,4 -k5,5')
    cons_headers = []  # Consensus headers
    vcf_samples = []  # Sample names of each VCF
    for vcf_index, vcf_path in enumerate(vcf_paths):
        info('Merging VCF file %s...' % vcf_path)
        vcf = zopen(vcf_path)
        for line in vcf:
            if not line.startswith('#'): break
        headers = line.rstrip('\n').split('\t')
        gtype_col = (4 if not 'ESP6500' in headers else
                     headers.index('ESP6500') + 1)
        if not cons_headers: cons_headers = headers[:gtype_col]
        if cons_headers != headers[:gtype_col]: error('Header mismatch!')
        vcf_samples.append(headers[gtype_col:])
        for line in vcf:
            sort_in.write('%d\t%s' % (vcf_index, line))
    sort_in.close()

    print('\t'.join(cons_headers + sum(vcf_samples, [])))
    vcf_sample_counts = [len(samples) for samples in vcf_samples]
    S = sum(vcf_sample_counts)
    vcf_sample_col = [
        sum(vcf_sample_counts[0:k]) for k in range(len(vcf_samples))
    ]

    info('Merged VCF will contain:')
    info('- %d header columns' % len(cons_headers))
    for samples, path in zip(vcf_samples, vcf_paths):
        info('- %d columns from %s' % (len(samples), path))

    prev = None
    calls = [':0:0'] * S
    for line in sort_out:
        cols = line.rstrip('\n').split('\t')
        vcf_index = int(cols[0])
        call_col = vcf_sample_col[vcf_index]
        if prev is None or prev[:4] != cols[1:5]:
            if prev is not None:
                print('\t'.join(prev + calls))
            prev = cols[1:gtype_col + 1]
            calls = [':0:0'] * S
        calls[call_col:call_col + vcf_sample_counts[vcf_index]] = \
            cols[gtype_col + 1:]

    if prev is not None:
        print('\t'.join(prev + calls))  # Handle the last line
Example #10
def variant_signature(vcf_path, genome_path):
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    chromosomes = read_fasta(genome_path)

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    samples = headers[sample_col:]

    substitutions = []
    for ref in 'CT':
        for alt in ('AGT' if ref == 'C' else 'ACG'):
            for pre in 'ACGT':
                for post in 'ACGT':
                    substitutions.append(pre + ref + post + '>' + pre + alt +
                                         post)
    sub_count = np.zeros((len(substitutions), len(samples)))

    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not cols[2] in 'ACGT' or not cols[3] in 'ACGT': continue
        chr = chromosomes[cols[0]]
        pos = int(cols[1])
        if chr[pos - 1] != cols[2]: error('Reference mismatch!')
        ref = chr[pos - 2:pos + 1]
        alt = ref[0] + cols[3] + ref[2]
        if ref[1] in 'AG':
            ref = revcomplement(ref)
            alt = revcomplement(alt)

        for s, gt in enumerate(cols[sample_col:]):
            if gt_symbols.index(gt.split(':')[0]) > 1:
                sub_count[substitutions.index(ref + '>' + alt), s] += 1

    print('SUBSTITUTION\t%s' % '\t'.join(samples))
    for sub in substitutions:
        sys.stdout.write(sub)
        for count in sub_count[substitutions.index(sub), :]:
            sys.stdout.write('\t%d' % count)
        sys.stdout.write('\n')
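Both this example and Example #18 call a revcomplement helper that is not shown; a minimal sketch for upper-case DNA strings:

_COMPLEMENT = str.maketrans('ACGT', 'TGCA')

def revcomplement(seq):
    # Hypothetical helper: reverse-complement an upper-case DNA string.
    return seq.translate(_COMPLEMENT)[::-1]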
Example #11
def sam_unaligned_reads(bam_path):
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has max 1 primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    if has_base_qualities(bam_path):
        shell('samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
              (bam_path, options))
    else:
        bam2fq = shell_stdout(
            'samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
            (bam_path, options))
        for line in bam2fq:
            if line[0] != '@': error('Invalid bam2fq output.')
            sys.stdout.write('>')
            sys.stdout.write(line[1:])
            sys.stdout.write(next(bam2fq))

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)
Example #12
def variant_heterozygous_concordance(vcf_path, kgenomes_path, test_rx, ref_rx):
    is_snp = np.zeros(300 * 1000 * 1000, np.bool_)
    for line in zopen(kgenomes_path):
        pos = int(line[:-1].split('\t')[1])
        is_snp[pos] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1

    test_col = [
        i for i, h in enumerate(headers)
        if re.search(test_rx, h) and i >= sample_col
    ]
    ref_col = [
        i for i, h in enumerate(headers)
        if re.search(ref_rx, h) and i >= sample_col
    ]
    if len(test_col) != 1: error('Test sample not found.')
    if len(ref_col) != 1: error('Reference sample not found.')
    test_col = test_col[0]
    ref_col = ref_col[0]

    total_hetz_in_ref = 0
    total_concordant = 0
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not is_snp[int(cols[1])]: continue

        test = cols[test_col]
        test_gt = gt_symbols.index(test[:test.find(':')])
        ref = cols[ref_col]
        ref_gt = gt_symbols.index(ref[:ref.find(':')])

        if ref_gt == 2:
            total_hetz_in_ref += 1
            total_concordant += (test_gt == 2)

    print('Concordance was %.1f%% (%d / %d).' %
          (float(total_concordant) / total_hetz_in_ref * 100, total_concordant,
           total_hetz_in_ref))
Example #13
def parallel(command, job_name, max_workers, cpus, memory, partition,
	time_limit):
	
	# Allow splitting the command string onto multiple lines.
	command = command.replace('\n', ' ')
	
	if sys.stdin.isatty():
		# If the user did not provide any input, just run the command once.
		# The command must not contain $x.
		if '$x' in command or '${x' in command:
			error('Command contains $x but no targets provided.')
		targets = ['']
	else:
		# Parse whitespace-delimited target items from standard input.
		targets = []
		for line in sys.stdin:
			targets += line.split(' ')
		targets = [t.replace('\n', '') for t in targets]
		
		if not targets: error('Command requires targets but none provided.')
	
	if len(set(targets)) < len(targets):
		duplicates = [s for s in set(targets) if targets.count(s) > 1]
		error('Target list contains multiple instances of the following '
			'targets:\n' + '\n'.join(duplicates))

	if max_workers > len(targets): max_workers = len(targets)

	if partition != 'local':
		info('Distributing %d %s named "%s" on %s partition '
			'(with %d %s and %d GB of memory per job).' % (
			len(targets), 'jobs' if len(targets) != 1 else 'job',
			job_name, partition, cpus, 'CPUs' if cpus != 1 else 'CPU', memory))
	else:
		info('Starting %d %s named "%s" on local machine.' % (
			len(targets), 'jobs' if len(targets) != 1 else 'job', job_name))

	log_dir = os.path.expanduser('~/.jobs/%s_%s' % (job_name, 
		datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
	os.makedirs(log_dir)

	with open('%s/tasks' % log_dir, 'w') as f:
		f.write('%s\n' % command)
		for target in targets: f.write('%s\n' % target)

	if partition == 'local':
		worker_cmd = ['parallel', 'worker', log_dir]
		workers = [subprocess.Popen(worker_cmd) for w in range(max_workers)]
		for w in workers: w.wait()
	else:
		# Run the job steps on a SLURM cluster using sbatch.
		# Required memory is given in GB per job step. Convert to MB per CPU.
		mem_per_cpu = round(float(memory) / cpus * 1000)
		sbatch_script = sbatch_template % (partition, job_name, cpus,
			mem_per_cpu, 60 * time_limit, log_dir, log_dir, log_dir)
		workers = [subprocess.Popen(['sbatch', '-Q'], stdin=subprocess.PIPE)
			for p in range(max_workers)]
		for w in workers:
			w.stdin.write(sbatch_script.encode('utf-8'))
			w.stdin.close()
		for w in workers: w.wait()
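The SLURM branch fills an sbatch_template string with eight values: partition, job name, CPUs, MB per CPU, minutes, and the log directory three times. The template itself is not shown; a hypothetical version consistent with those substitutions (and with the local worker_cmd above) might look like:

sbatch_template = '''#!/bin/bash
#SBATCH -p %s
#SBATCH -J %s
#SBATCH -c %d
#SBATCH --mem-per-cpu=%d
#SBATCH -t %d
#SBATCH -o %s/worker-%%j.out
#SBATCH -e %s/worker-%%j.err
parallel worker %s
'''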
Example #14
def bed_composite(bed_path):
	features = {}
	for line in zopen(bed_path):
		if line.startswith('#'): continue
		c = line.rstrip('\n').split('\t')
		chr, start, end, name = c[0], int(c[1]), int(c[2]), c[3]

		feature = features.setdefault(name, [chr, [(start, end)]])
		if chr != feature[0]: error('Chromosome mismatch.')

		segments = feature[1]
		overlapping = [seg for seg in segments
			if end >= seg[0] and start <= seg[1]]
		disjoint = [seg for seg in segments
			if not (end >= seg[0] and start <= seg[1])]
		disjoint.append((min([start] + [seg[0] for seg in overlapping]),
			max([end] + [seg[1] for seg in overlapping])))
		feature[1] = disjoint

	for name, feature in features.items():
		segments = feature[1]
		for seg in segments:
			print('%s\t%d\t%d\t%s' % (feature[0], seg[0], seg[1], name))
Example #15
def filter_variants(sv_path, min_reads, blacklist_path=None):

    read_rules = [r.split('-') for r in min_reads]
    for k, r in enumerate(read_rules):
        if len(r) != 3:
            error('Invalid minimum read rule %s specified.' % min_reads[k])

    blacklist = set()
    if blacklist_path:
        blacklist = set([x.rstrip('\n') for x in open(blacklist_path)])

    sv_file = open(sv_path)
    sys.stdout.write(next(sv_file))  # Header
    for line in sv_file:
        tokens = line.rstrip('\n').split('\t')

        valid = [
            int(tokens[10]) >= int(rule[0]) and int(tokens[11]) >= int(rule[1])
            and int(tokens[10]) + int(tokens[11]) >= int(rule[2])
            for rule in read_rules
        ]
        if not any(valid): continue

        chrom = tokens[0]
        pos = int(tokens[2])
        loci_1 = set(sv_locus_identifiers(chrom, pos))

        chrom = tokens[5]
        pos = int(tokens[7])
        loci_2 = set(sv_locus_identifiers(chrom, pos))

        # We discard a rearrangement if *both* endpoints are located
        # in blacklisted regions.
        if loci_1.isdisjoint(blacklist) or loci_2.isdisjoint(blacklist):
            sys.stdout.write(line)

    sv_file.close()
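The blacklist test relies on an sv_locus_identifiers helper that maps an endpoint to the locus identifiers used in the blacklist file. Purely as a hypothetical sketch, assuming kilobase-grid identifiers that also cover the neighboring bins:

def sv_locus_identifiers(chrom, pos):
    # Hypothetical helper: identifiers for the endpoint's kb bin and neighbors.
    kb = pos // 1000
    return ['%s:%dk' % (chrom, k) for k in (kb - 1, kb, kb + 1)]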
Example #16
def somatic(vcf_path, sample_pairs):
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    samples = headers[sample_col:]

    # Convert sample pair names into index 2-tuples.
    sample_pairs = [pair.split(',') for pair in sample_pairs]
    if not all(len(pair) == 2 for pair in sample_pairs):
        info([pair for pair in sample_pairs if len(pair) != 2])
        error('Test and control samples must be in "test,control" format.')
    for pair in sample_pairs:
        if not pair[0] in samples:
            error('Test sample %s was not found in VCF file.' % pair[0])
        if not pair[1] in samples:
            error('Control sample %s was not found in VCF file.' % pair[1])
    sample_pairs = [(samples.index(pair[0]), samples.index(pair[1]))
                    for pair in sample_pairs]

    sys.stdout.write(line)

    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        gt_cols = cols[sample_col:]

        genotypes = [gt_symbols.index(g[:g.find(':')]) for g in gt_cols]

        somatic = [
            genotypes[pair[0]] >= 2 and genotypes[pair[1]] == 1
            for pair in sample_pairs
        ]
        if not any(somatic): continue

        sys.stdout.write(line)
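Given gt_symbols = ['', '0/0', '0/1', '1/1'], the filter keeps a variant when, for at least one pair, the test sample's genotype index is 2 or above (heterozygous or homozygous alt) while the control's is exactly 1 (homozygous reference).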
Example #17
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    #print(bam_paths, genome_path, options.region, options.homz)
    gt_symbols = ['', '0/0', '0/1', '1/1']
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]

    # print('CHROM\tPOSITION\tREF\tALT\t%s' % '\t'.join(samples))
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) != None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths,
                              genome_path,
                              kgenomes_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all else
                                               options.min_hetz_reads),
                              region=options.region):

        if type(line) == bytes:
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3: error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N': continue
        pileups = [p.split(' ') for p in tokens[3:]]
        #total_reads = np.zeros(len(samples))
        #allele_reads = defaultdict(lambda: np.zeros(len(samples)))

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3: continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a+1]) + \
                 (int(pileup[a+2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                if pileup[a] != '.': allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        # for alt, reads in allele_reads.iteritems():
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # if not options.keep_all and all(gt < 2 for gt in genotypes): continue
            # if all(gt != 2 for gt in genotypes): continue
            if genotypes[1] != 2: continue

            gtypes = ('%s:%d:%d' % (gt_symbols[g], reads[s], total_reads[s])
                      for s, g in enumerate(genotypes))
            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':  # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':  # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            ########################
            ## Heterozygous bases ##
            ########################

            gt_list = list(gtypes)
            gt_col = gt_list[1]  ## genotype for the normal sample
            genotype = gt_symbols.index(gt_col[:gt_col.find(':')])
            total_read = float(gt_col.split(':')[2])
            if not (genotype == 2 and total_read >= 15): continue

            #########################
            ## calculating the BAF ##
            #########################

            read = gt_list[0].split(':')[1:3]  ## reads for the tumor sample
            sys.stdout.write('\t'.join(
                [tokens[0], tokens[1], ref,
                 alt.upper()]))
            alt, total = float(read[0]), int(read[1])
            sys.stdout.write('\tNaN' if total == 0 else '\t%.2f' %
                             (alt / total))
            sys.stdout.write('\n')
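Note the hard-coded sample order in this BAF calculation: gt_list[1] (the second BAM file) is treated as the normal sample, which must be a heterozygous call backed by at least 15 reads, while gt_list[0] (the first BAM file) supplies the tumor read counts from which the B-allele fraction is computed.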
Example #18
def visualize_splicing(genes, fastq_prefix, out_prefix):
	genome_path = '/data/csb/organisms/homo_sapiens/hg19_flat'
	bed_path = '/data/csb/organisms/homo_sapiens/ensembl_68/exons.bed'
	genes = genes.replace(' ', '').split(',')
	min_anchor = 15
	read_len = 90
	trim = read_len - min_anchor
	
	chromosomes = read_flat_seq(genome_path)
	
	donors = []
	acceptors = []
	exons = []
	for line in zopen(bed_path):
		cols = line[:-1].split('\t')
		if cols[3] in genes:
			chr = cols[0] if cols[0].startswith('chr') else 'chr'+cols[0]
			chr_seq = chromosomes[chr]
			pos = (int(cols[1])+1, int(cols[2]))
			if cols[5] == '+':
				acceptors.append((chr, '+', pos[0],
					chr_seq[pos[0]-1:pos[0]-1+trim]))
				donors.append((chr, '+', pos[1], chr_seq[pos[1]-trim:pos[1]]))
			elif cols[5] == '-':
				acceptors.append((chr, '-', pos[1],
					revcomplement(chr_seq[pos[1]-trim:pos[1]])))
				donors.append((chr, '-', pos[0],
					revcomplement(chr_seq[pos[0]-1:pos[0]-1+trim])))
			exons.append(pos)
				
	# Remove duplicate acceptors and donors.
	acceptors = list(set(acceptors))
	donors = list(set(donors))
	exons = list(set(exons))
	
	# Calculate the contiguous genomic sequence
	chr = acceptors[0][0]
	if any(a[0] != chr for a in acceptors):
		error('Genes must be in the same chromosome!')
	
	genome_window = (min(a[2] for a in acceptors)-2000,
		max(a[2] for a in acceptors)+2000)
	#contig = chromosomes[chr][genome_window[0]:genome_window[1]]
	
	# Calculate junction sequences
	class Junction(object):
		def __init__(self, name, seq):
			self.name = name
			self.sequence = seq
			self.reads = 0
			self.ratio = 0
	
	junctions = defaultdict(list)   # Group junctions by donor
	for left in donors:
		for right in acceptors:
			name = '%d[%s]_%d[%s]' % (left[2], left[1], right[2], right[1])
			junctions[left].append(Junction(name, left[3] + right[3]))
	print('Generated %d junctions.' % (len(donors) * len(acceptors)))
	
	# Build Bowtie index
	index_fasta_path = '%s_ref.fa' % out_prefix
	index = open(index_fasta_path, 'w')
	#index.write('>contig\n%s\n' % contig)
	for donor in junctions:
		for junc in junctions[donor]:
			index.write('>%s\n%s\n' % (junc.name, junc.sequence))
	index.close()
	shell('/data/csb/tools/bowtie-0.12.9/bowtie-build -q %s %s_index' %
		(index_fasta_path, out_prefix))
	
	# Align reads against junctions and tally junction read counts.
	shell('bowtie -v1 -B1 -p8 %s_index <(gunzip -c %s_1.fq.gz %s_2.fq.gz) '
		'> %s.bowtie' % (out_prefix, fastq_prefix, fastq_prefix, out_prefix))
	junction_by_name = {}
	for donor in junctions:
		for j in junctions[donor]: junction_by_name[j.name] = j
	for line in open('%s.bowtie' % out_prefix):
		cols = line[:-1].split('\t')
		if not '_' in cols[2]: continue
		junction_by_name[cols[2]].reads += 1
	
	# Calculate junction power relative to all outgoing links from donor
	for donor in junctions:
		total = sum(j.reads for j in junctions[donor])
		if total <= 0: continue
		for j in junctions[donor]:
			j.ratio = float(j.reads) / total
			if j.reads > 0:
				print('%s: %.1f%% (%d)' % (j.name, j.ratio*100, j.reads))
		
	# Check which exons actually participate in the mature transcripts
	active_edges = []
	for donor in junctions:
		for j in junctions[donor]:
			if j.ratio < 0.05: continue
			active_edges += [int(x[:-3]) for x in j.name.split('_')]
	
	exons = [[ex[0], ex[1], False] for ex in exons]
	ties = []
	for edge in set(active_edges):
		matches = [ex for ex in exons if edge in ex]
		if len(matches) == 1: matches[0][2] = True  # Unique match, mark active
		if len(matches) > 1: ties.append(matches)
	for tie in ties:
		if not any(ex[2] for ex in tie):
			for ex in tie: ex[2] = True   # If still tied, mark all tied active
	
	# Print exon map
	from svgfig import Rect, Frame, Poly
	rects = [Rect(ex[0], 1, ex[1], 2, stroke='none',
		fill='whitesmoke', stroke_linejoin='miter')
		for ex in exons if not ex[2]]
	rects += [Rect(ex[0], 1, ex[1], 2, stroke='none',
		fill='black', stroke_linejoin='miter') for ex in exons if ex[2]]
	lines = []
	for donor in junctions:
		for j in junctions[donor]:
			start, end = [int(x[:-3]) for x in j.name.split('_')]
			lines.append(Poly([(start,2), ((start+end)/2,3), (end,2)],
				stroke_opacity=j.ratio))
		
	Frame(genome_window[0], genome_window[1], 0, 10, *(rects+lines),
		width=500).SVG().save('%s.svg' % out_prefix)
	
	shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))
Example #19
def variant_top_mutated_regions(vcf_path, region_size):
    if region_size % 2: error('Region size must be divisible by two.')
    step = region_size // 2

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line.rstrip('\n').split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    samples = headers[sample_col:]

    # Construct chromosome map
    chr_sizes = defaultdict(int)
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        chr_sizes[cols[0]] = max(chr_sizes[cols[0]], int(cols[1]))
    vcf_file.close()

    mutated = {}  # Which samples are mutated in each bin
    variant_pos = {}  # Position of variant in bin, -1 if various
    for chr in chr_sizes:
        mutated[chr] = np.zeros((chr_sizes[chr] // step + 1, len(samples)),
                                dtype=np.bool_)
        variant_pos[chr] = np.zeros(chr_sizes[chr] // step + 1, dtype=np.int32)

    # Reopen VCF file (might be compressed), identify columns
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    # Tally mutated samples in each region
    print('Tallying mutated samples...')
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        pos = int(cols[1])
        bin = (pos - 1) // step

        vpos = variant_pos[cols[0]]
        vpos[bin] = -1 if vpos[bin] > 0 and vpos[bin] != pos else pos
        if bin > 0:
            vpos[bin - 1] = (-1 if vpos[bin - 1] > 0 and vpos[bin - 1] != pos
                             else pos)

        mut = mutated[cols[0]]
        for s, gt in enumerate(cols[sample_col:]):
            if gt_symbols.index(gt.split(':')[0]) <= 1: continue
            mut[bin, s] = True
            if bin > 0: mut[bin - 1, s] = True

    # Convert mutation bitmasks into counts
    print('Converting to counts...')
    for chr in mutated:
        mutated[chr] = mutated[chr].sum(axis=1)

    # Print regions in descending order starting with highest recurrence
    print('Finding maximum...')
    highest = 0
    for chr in mutated:
        highest = max(highest, max(mutated[chr]))

    print('Top regions with two or more mutated sites:')
    for n in range(highest, 1, -1):
        for chr in mutated:
            mut = mutated[chr]
            vpos = variant_pos[chr]
            for bin in range(len(mut)):
                if mut[bin] != n or vpos[bin] != -1: continue
                print('%s:%d-%d\t%d samples' %
                      (chr, bin * step + 1, bin * step + region_size, n))
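For instance, with region_size = 1000 the step is 500 and the windows are half-overlapping 1 kb bins: a variant at position 1234 lands in bin (1234 - 1) // 500 = 2 and is also counted in the preceding bin, i.e. in both windows 1001-2000 and 501-1500.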
Example #20
def backup(rules_path, interactive):
	passwords = {}
	rules = []
	for line in open(rules_path):
		line = line.strip()
		if not line or line[0] == '#': continue
		tokens = line.strip().split()
		if len(tokens) != 2: error('Invalid rule: "%s"' % line)

		if not ':' in tokens[1]: error('Missing host: "%s"' % line)
		host, path = tokens[1].split(':')

		username = getpass.getuser()
		if '@' in host:
			username, host = host.split('@')

		if not os.path.isdir(tokens[0]):
			print('Directory %s does not exist. Ignoring rule...' % tokens[0])
			continue

		rule = Object()
		rule.src_dir = tokens[0]
		rule.dst_host = host
		rule.dst_dir = path
		rule.username = username
		rule.password = passwords[host] if host in passwords else \
			getpass.getpass('Password for %s: ' % host)
		passwords[host] = rule.password
		rules.append(rule)

	def lftp_mirror(rule, dry_run=False):

		cmds = open('.lftp_script', 'w')
		cmds.write('open -u %s,%s sftp://%s\n' % (
			rule.username, rule.password, rule.dst_host))
		cmds.write('mirror -P3 -Rae %s %s %s\n' % (
			'--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))
		cmds.close()

		if dry_run:
			userpass = rule.username + ':' + rule.password + '@'
			host = rule.dst_host

			out = shell_stdout('lftp -f .lftp_script')
			for line in out:
				if line.startswith('chmod'): continue
				if line.startswith('mkdir'): continue

				m = re.match('get -O sftp://(.+) file:/.+/(.+)', line)
				if m:
					dst = m.group(1)
					if dst.startswith(userpass): dst = dst[len(userpass):]
					if dst.startswith(host): dst = dst[len(host):]
					print('ADD %s/%s' % (dst, m.group(2)))
					continue

				m = re.match('get -e -O sftp://(.+) file:/.+/(.+)', line)
				if m:
					dst = m.group(1)
					if dst.startswith(userpass): dst = dst[len(userpass):]
					if dst.startswith(host): dst = dst[len(host):]
					print('UPDATE %s/%s' % (dst, m.group(2)))
					continue

				m = re.match('rm .*sftp://(.+)', line)
				if m:
					dst = m.group(1)
					if dst.startswith(userpass): dst = dst[len(userpass):]
					if dst.startswith(host): dst = dst[len(host):]
					print('DELETE %s' % dst)
					continue

				sys.stdout.write(line)
		else:
			shell('lftp -f .lftp_script')

		os.remove('.lftp_script')
	
	for rule in rules:
		lftp_mirror(rule, dry_run=True)
			
	if interactive:
		if not input('Proceed with backup? [y/N] ').lower() in ('y', 'yes'):
			error('Backup canceled.')
	
	for rule in rules:
		lftp_mirror(rule)
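The Object used for backup rules is another helper from the surrounding module; a plausible stand-in is simply an empty attribute container:

class Object(object):
	# Hypothetical helper: a bare object that accepts arbitrary attributes.
	pass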
Example #21
def detect_rearrangements(sam_path,
                          genome_path,
                          out_prefix,
                          anchor_len,
                          min_mapq,
                          orientation,
                          max_frag_len,
                          discard_duplicates='both-ends'):

    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)

    if not discard_duplicates in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path,
                            out_prefix,
                            max_frag_len=max_frag_len,
                            min_mapq=min_mapq,
                            orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'

    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
          (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)
        if len(r.reads) < 2: return 0
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' %
                  (r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
                   sum(read[2] is None for read in r.reads),
                   sum(read[2] is not None for read in r.reads), ';'.join(
                       read[2] for read in r.reads if read[2] is not None)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')

    out = open('%s.sv' % out_prefix, 'w')
    out.write(sv_file_header + '\n')

    N = 0
    rearrangements = []
    for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
        al = line[:-1].split('\t')

        chr = al[0]
        strand = al[1]
        pos = int(al[2])
        mchr = al[3]
        mstrand = al[4]
        mpos = int(al[5])
        seq = None if al[6] == '-' else al[6]

        # Rearrangements that are too far need not be considered in the future
        reachable = []
        for r in rearrangements:
            if pos - r.pos > max_frag_len:
                N += report_rearrangement(out, r)
            else:
                reachable.append(r)
        rearrangements = reachable

        # Check if we already have a rearrangement that matches the new pair.
        # We don't check the distance for the first mate because we already
        # know from above the rearrangements near it.
        matches = [
            r for r in rearrangements
            if abs(mpos - r.mpos) <= max_frag_len and chr == r.chr
            and mchr == r.mchr and strand == r.strand and mstrand == r.mstrand
        ]

        read = (pos, mpos, seq)
        if matches:
            for match in matches:
                match.reads.append(read)

        else:
            # No suitable rearrangements, create a new one.
            rearrangements.append(
                Rearrangement(chr, strand, pos, mchr, mstrand, mpos, read))

    for r in rearrangements:
        N += report_rearrangement(out, r)

    info('Found %d rearrangements with at least 2 reads of evidence.' % N)
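The Rearrangement container is not shown; a minimal sketch consistent with how it is constructed and accessed above:

class Rearrangement(object):
    # Hypothetical container for a cluster of discordant reads supporting
    # the same putative breakpoint pair.
    def __init__(self, chr, strand, pos, mchr, mstrand, mpos, read):
        self.chr, self.strand, self.pos = chr, strand, pos
        self.mchr, self.mstrand, self.mpos = mchr, mstrand, mpos
        self.reads = [read]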
Example #22
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq,
                            orientation):

    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    N = 0

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'

    # Go through all the first mates and look for discordant pairs.
    info('Searching for discordant read pairs...')
    prev = ['']
    for line in shell_stdout(
            'sam discordant pairs --min-mapq=%d %s %d | sort -k1,1 -T %s' %
        (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):

        al = line.split('\t')
        if len(al) < 9: continue

        # Discard spliced and clipped reads.
        # FIXME: Add support for spliced RNA-seq reads.
        if 'N' in al[5] or 'S' in al[5]: continue

        if al[0].endswith('/1') or al[0].endswith('/2'):
            al[0] = al[0][:-2]  # Remove /1 or /2 suffix

        if al[0] != prev[0]:
            prev = al
            continue

        flags = int(al[1])
        chr = al[2]
        mchr = prev[2]
        strand = '-' if flags & 0x10 else '+'
        mstrand = '-' if flags & 0x20 else '+'
        pos = int(al[3])
        mpos = int(prev[3])
        rlen = len(al[9])
        mrlen = len(prev[9])

        if not chr.startswith('chr'): chr = 'chr' + chr
        if not mchr.startswith('chr'): mchr = 'chr' + mchr

        if chr == 'chrM' or mchr == 'chrM': continue  # Discard mitochondrial

        if orientation == 'fr':
            # Reorient pairs so that the first mate is always upstream.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = mstrand, strand

            # Convert to forward-forward orientation (flip second mate).
            mstrand = '-' if mstrand == '+' else '+'

        elif orientation == 'rf':
            # Reorient pairs so that the first mate is always upstream.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = mstrand, strand

            # Convert to forward-forward orientation (flip first mate).
            strand = '-' if strand == '+' else '+'

        elif orientation == 'ff':
            # Reorient pairs so that the first mate is always upstream.
            # If mates are swapped, both mates must be reversed.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = '+' if mstrand == '-' else '-', \
                 '+' if strand == '-' else '-'

        else:
            error('Unsupported read orientation detected.')

        # Make positions represent read starts.
        if strand == '-': pos += rlen - 1
        if mstrand == '-': mpos += mrlen - 1

        # Each discordant mate pair is represented as a 7-tuple
        # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, None).
        # The None at the end signifies that this is a mate pair.
        # Positions are 1-based and represent read starts.
        out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
                  (chr, strand, pos, mchr, mstrand, mpos))
        N += 1

    out.close()
    info('Found %d discordant mate pairs.' % N)
Example #23
def variant_call(bam_paths, genome_path, options):

    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % '\t'.join(samples))

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) != None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths,
                              genome_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all else
                                               options.min_hetz_reads),
                              region=options.region):

        tokens = line[:-1].split('\t')
        if len(tokens) < 3: error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N': continue
        pileups = [p.split(' ') for p in tokens[3:]]

        #total_reads = np.zeros(len(samples))
        #allele_reads = defaultdict(lambda: np.zeros(len(samples)))

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3: continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a+1]) + \
                 (int(pileup[a+2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                if pileup[a] != '.': allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)
            if not options.keep_all and all(gt < 2 for gt in genotypes):
                continue

            gtypes = ('%s:%d:%d' % (gt_symbols[g], reads[s], total_reads[s])
                      for s, g in enumerate(genotypes))

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':  # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':  # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            print('%s\t%s\t%s\t%s\t%s' %
                  (tokens[0], tokens[1], ref, alt.upper(), '\t'.join(gtypes)))
Example #24
 elif args['reads'] and args['--raw']:
     sam_reads_raw(args['<bam_file>'], args['<out_prefix>'])
 elif args['reads']:
     sam_reads(args['<bam_file>'], args['<out_prefix>'])
 elif args['compact']:
     sam_compact(args['<bam_file>'])
 elif args['discordant'] and args['pairs']:
     sam_discordant_pairs(args['<bam_file>'],
                          int(args['<max_frag_size>']),
                          orientation=args['--orientation'],
                          min_mapq=int(args['--min-mapq']))
 elif args['fragments']:
     sam_fragments(args['<bam_file>'], int(args['<max_frag_len>']))
 elif args['read'] and args['length']:
     read_len = read_length(args['<bam_file>'])
     if not read_len: error('Could not determine read length.')
     else: print('%d' % read_len)
 elif args['pileup'] and args['each']:
     sam_pileup_each(args['<vcf_file>'],
                     args['<bam_files>'],
                     min_al_quality=int(args['--min-mapq']))
 elif args['pileup']:
     sam_pileup(args['<region>'],
                args['<bam_files>'],
                min_al_quality=int(args['--min-mapq']))
 elif args['count']:
     sam_count(args['<bam_file>'], args['<bed_file>'])
 elif args['counts'] and args['merge']:
     sam_merge_counts(args['<bed_file>'], args['<count_files>'])
 elif args['fragment'] and args['lengths']:
     sam_fragment_lengths(args['<bam_file>'])