Exemplo n.º 1
0
def calculate_depth_distribution(bedgraph_file, reference_assembly,
                                 output_depth_distribution,
                                 output_filtered_regions, ploidy,
                                 cnv_bedgraph_file, p_threshold, merge_window):
    '''
    Summarize the depth distribution of a bedGraph file and, optionally,
    write out depth-filtered regions.

    The two values returned by calculate_depth_distribution_bedgraph (bound
    here as mu and sigma) are forwarded to filter_regions_by_depth_bedgraph,
    but only when output_filtered_regions is truthy.  Chromosome sizes are
    read from reference_assembly in either case.  Nothing is returned.
    '''
    distribution_parameters = calculate_depth_distribution_bedgraph(
        bedgraph_file, output_depth_distribution, ploidy, cnv_bedgraph_file)
    mu, sigma = distribution_parameters

    chrom_sizes = read_chrom_sizes(reference_assembly)

    # No output path for filtered regions means there is nothing left to do.
    if not output_filtered_regions:
        return

    filter_regions_by_depth_bedgraph(
        bedgraph_file, chrom_sizes, mu, sigma, output_filtered_regions,
        ploidy, cnv_bedgraph_file, p_threshold, merge_window)
Exemplo n.º 2
0
def calculate_depth_ratios(input_bedgraph,
                           reference_assembly,
                           output_file,
                           mean=None,
                           ploidy=2,
                           bin_size=500):
    '''
    For binned depths across the input bedGraph file, calculate the ratio
    relative to the mean.

    Every position covered by a non-zero-depth interval is assigned to a bin
    according to its distance from the nearer chromosome end; the median
    ploidy-normalized depth ratio of each bin is written to output_file as a
    tab-delimited table with the columns 'Bin Start', 'Bin End',
    'Bin Center' and 'Median Value'.

    input_bedgraph -- Tab-delimited file with chromosome, start, end and
        depth columns.
    reference_assembly -- Passed to read_chrom_sizes to obtain chromosome
        sizes.
    output_file -- Path of the table to write.
    mean -- Depth that ratios are taken against.  If falsy, the first value
        returned by calculate_depth_distribution_bedgraph is used.
    ploidy -- Depths are divided by this value before the ratio is taken.
    bin_size -- Width of each distance bin; must be positive.

    Raises ValueError if bin_size is not positive.
    '''
    if bin_size <= 0:
        raise ValueError('bin_size must be positive')

    if not mean:
        mean = calculate_depth_distribution_bedgraph(input_bedgraph,
                                                     os.devnull)[0]
    chrom_sizes = read_chrom_sizes(reference_assembly)

    binned_values = defaultdict(list)

    with open(input_bedgraph) as f:
        for line in f:
            chromosome, start, end, depth = line.strip().split('\t')
            start = int(start)
            end = int(end)
            depth = float(depth)

            if depth == 0:
                continue

            chrom_size = chrom_sizes[chromosome]

            # The ratio is constant across the interval, so compute and
            # filter it once instead of once per position.
            ratio = (depth / ploidy) / mean
            if not 0.25 <= ratio <= 4:
                continue

            # Interval [start, end) covers 1-based positions start+1..end.
            for position in range(start + 1, end + 1):
                # Distances are measured from the position itself so that a
                # long interval is spread over every bin it spans rather
                # than being attributed entirely to its last base.
                min_distance = min(position - 1, chrom_size - position)
                bin_start = (min_distance // bin_size) * bin_size
                _bin = (bin_start, bin_start + bin_size - 1)
                binned_values[_bin].append(ratio)

    output_values = [
        {
            'Bin Start': int(_bin[0]),
            'Bin End': int(_bin[1]),
            'Bin Center': int(math.ceil(float(sum(_bin)) / 2)),
            'Median Value': numpy.median(values),
        }
        for _bin, values in sorted(binned_values.items(),
                                   key=lambda x: x[0][0])
    ]

    with open(output_file, 'w') as OUT:
        fieldnames = ['Bin Start', 'Bin End', 'Bin Center', 'Median Value']
        writer = csv.DictWriter(OUT, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(output_values)
Exemplo n.º 3
0
def correct_depths(y_int, scalar, mean_log, sd_log, slope, reference_assembly,
                   input_bedgraph, output_bedgraph):
    '''
    Write a corrected copy of a depth bedGraph file.

    Correction is performed using the sum of a log-normal cumulative
    distribution function and linear function.  This function only reads the
    chromosome sizes for reference_assembly and hands them, together with
    the fit parameters, to write_corrected_bedgraph.
    '''
    # Kept in the positional order write_corrected_bedgraph receives them.
    fit_parameters = (y_int, scalar, mean_log, sd_log, slope)

    chrom_sizes = read_chrom_sizes(reference_assembly)
    write_corrected_bedgraph(input_bedgraph, chrom_sizes, output_bedgraph,
                             *fit_parameters)
Exemplo n.º 4
0
def call_mutations(reference_assembly,
                   control_sample,
                   sample_list,
                   input_vcf,
                   output_header,
                   excluded_regions=None,
                   fwer=0.01):
    '''
    Using a reference assembly, sample list, and VCF file from GATK
    HaplotypeCaller, call mutations.

    Results are written to '<output_header>.mutations.txt' and
    '<output_header>.mutations.vcf'.

    reference_assembly -- Reference assembly path.  It must already be
        indexed, and a repeat file named after it (extension replaced by
        '.repeats') must exist.
    control_sample -- Name of the control sample; matched against the
        sample_name of each sample read from sample_list.  If no sample
        matches, None is passed on as the control.
    sample_list -- Text file the samples are read from.
    input_vcf -- VCF file to call mutations from.
    output_header -- Prefix of the two output files.
    excluded_regions -- Regions to exclude from variant calling (BED format).
    fwer -- Passed through to VariantList unchanged.

    Exits with status 1 after writing a message to stderr if the reference
    indices or the repeat file are missing.
    '''
    repeat_file = '{}.repeats'.format(os.path.splitext(reference_assembly)[0])

    # Use sys.exit with a non-zero status: the bare exit() helper is only
    # provided by the site module and would report success to the shell.
    if not reference.check_reference_indices(reference_assembly):
        sys.stderr.write('Reference assembly not indexed. Run '
                         '"muver index_reference".\n')
        sys.exit(1)
    if not os.path.exists(repeat_file):
        sys.stderr.write('Repeats not found for reference assembly. Run '
                         '"muver create_repeat_file".\n')
        sys.exit(1)

    samples = sample.read_samples_from_text(sample_list)
    # Swap the control sample's name for the matching sample object.
    control_sample = next(
        (x for x in samples if x.sample_name == control_sample),
        None,
    )

    chrom_sizes = reference.read_chrom_sizes(reference_assembly)

    variants = variant_list.VariantList(input_vcf, samples, excluded_regions,
                                        repeat_file, control_sample,
                                        chrom_sizes, fwer)

    text_output = '{}.mutations.txt'.format(output_header)
    vcf_output = '{}.mutations.vcf'.format(output_header)

    variants.write_output_table(text_output)
    variants.write_output_vcf(vcf_output)
Exemplo n.º 5
0
def run_pipeline(reference_assembly,
                 fastq_list,
                 control_sample,
                 experiment_directory,
                 p=1,
                 excluded_regions=None,
                 fwer=0.01,
                 max_records=1000000):
    '''
    Run the MuVer pipeline considering input FASTQ files.  All files written
    to the experiment directory.

    Steps, in order: align each sample's FASTQ files with bowtie2, process
    the resulting SAM files, run GATK HaplotypeCaller on the merged BAMs,
    analyze depth distributions, characterize repeat indel rates, call
    variants, and write the mutation table, mutation VCF and sample info
    file.

    reference_assembly -- Reference assembly path.  It must already be
        indexed, and a repeat file named after it (extension replaced by
        '.repeats') must exist.
    fastq_list -- Text file the samples are read from.
    control_sample -- Name of the control sample; matched against the
        sample_name of each sample.  If no sample matches, None is passed
        on as the control.
    experiment_directory -- Directory that receives all output.
    p -- Number of worker processes in the pool; also handed to bowtie2
        (as p) and to HaplotypeCaller (as nct).
    excluded_regions -- Passed through to VariantList unchanged.
    fwer -- Passed through to VariantList unchanged.
    max_records -- Passed through to process_sams unchanged.

    Exits with status 1 after writing a message to stderr if the reference
    indices or the repeat file are missing.
    '''
    repeat_file = '{}.repeats'.format(os.path.splitext(reference_assembly)[0])

    # Use sys.exit with a non-zero status: the bare exit() helper is only
    # provided by the site module and would report success to the shell.
    if not reference.check_reference_indices(reference_assembly):
        sys.stderr.write('Reference assembly not indexed. Run '
                         '"muver index_reference".\n')
        sys.exit(1)
    if not os.path.exists(repeat_file):
        sys.stderr.write('Repeats not found for reference assembly. Run '
                         '"muver create_repeat_file".\n')
        sys.exit(1)

    pool = Pool(p)
    try:
        generate_experiment_directory(experiment_directory)
        samples = read_samples_from_text(fastq_list,
                                         exp_dir=experiment_directory)
        # Swap the control sample's name for the matching sample object.
        control_sample = next(
            (x for x in samples if x.sample_name == control_sample),
            None,
        )

        for sample in samples:
            sample.generate_intermediate_files()

        # Align
        for sample in samples:
            for i, fastqs in enumerate(sample.fastqs):
                if len(fastqs) == 2:
                    f1, f2 = fastqs
                else:
                    # Anything other than a pair is aligned using only its
                    # first file.
                    f1 = fastqs[0]
                    f2 = None
                bowtie2.align(f1,
                              reference_assembly,
                              sample._sams[i].name,
                              fastq_2=f2,
                              p=p)

        # Process output SAM files
        pool.map(
            process_sams,
            zip(
                [s.sample_name for s in samples],
                [s.get_intermediate_file_names() for s in samples],
                repeat(reference_assembly),
                repeat(max_records),
            ))

        # Run HaplotypeCaller
        haplotype_caller_vcf = os.path.join(experiment_directory,
                                            'gatk_output',
                                            'haplotype_caller_output.vcf')
        haplotype_caller_log = os.path.join(experiment_directory, 'logs',
                                            'haplotype_caller.log')
        bams = [s.merged_bam for s in samples]
        gatk.run_haplotype_caller(
            bams,
            reference_assembly,
            haplotype_caller_vcf,
            haplotype_caller_log,
            nct=p,
        )

        chrom_sizes = reference.read_chrom_sizes(reference_assembly)

        # Each worker returns (sample index, strand bias std) so results can
        # be attached to the right sample.
        strand_bias_std_values = pool.map(
            analyze_depth_distribution,
            zip(
                range(len(samples)),
                [s.get_intermediate_file_names() for s in samples],
                repeat(reference_assembly),
                repeat(chrom_sizes),
                [s.ploidy for s in samples],
                [s.cnv_regions for s in samples],
            ))
        for index, strand_bias_std in strand_bias_std_values:
            samples[index].strand_bias_std = strand_bias_std

        # Characterize repeats; a '.sample' repeat file is preferred when
        # one exists next to the full repeat file.
        if os.path.isfile(repeat_file + '.sample'):
            repeats = read_repeats(repeat_file + '.sample')
        else:
            repeats = read_repeats(repeat_file)

        pool.map(
            characterize_repeat_indel_rates,
            zip(
                [s.get_intermediate_file_names() for s in samples],
                repeat(repeats),
                [s.repeat_indel_header for s in samples],
            ))
    except BaseException:
        # Do not wait on work that may still be in flight after a failure
        # or an interrupt.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Reap the worker processes so they are not leaked.
        pool.join()

    for sample in samples:
        sample.repeat_indel_fits_dict = read_fits(sample.repeat_indel_fits)

    variants = VariantList(haplotype_caller_vcf, samples, excluded_regions,
                           repeat_file, control_sample, chrom_sizes, fwer)

    text_output = os.path.join(experiment_directory, 'output', 'mutations.txt')
    vcf_output = os.path.join(experiment_directory, 'output', 'mutations.vcf')
    variants.write_output_table(text_output)
    variants.write_output_vcf(vcf_output)

    for sample in samples:
        sample.clear_temp_file_indices()

    sample_info_file = os.path.join(experiment_directory, 'sample_info.txt')
    write_sample_info_file(samples, sample_info_file)